In [171]:
import pandas as pd

In [220]:
df = pd.read_csv(filepath_or_buffer='iris_data.csv', delimiter=",")
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [221]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   target        150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [222]:
def clean_data(df: pd.DataFrame):
    print('before droping the null valued columns')
    print(df.isna().sum())
    df = df.dropna()
    print('after droping the null valued columns')
    print(df.isna().sum())
    return df

In [223]:
df = clean_data(df)

before droping the null valued columns
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
target          0
dtype: int64
after droping the null valued columns
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
target          0
dtype: int64


In [224]:
def split_data(df: pd.DataFrame):
    from sklearn.model_selection import train_test_split

    X = df.loc[:, df.columns != 'target']
    y = df.loc[:, df.columns == 'target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    print('X train size:', X_train.shape)
    print('Y train size:', X_test.shape)
    print('X test size:', y_train.shape)
    print('Y test size:', y_test.shape)

    return X_train, X_test, y_train, y_test

In [225]:
X_train, X_test, y_train, y_test = split_data(df)

X train size: (105, 4)
Y train size: (45, 4)
X test size: (105, 1)
Y test size: (45, 1)


In [226]:
def basic_decision_tree_classifier(X_train, y_train):
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    return model

In [227]:
model = basic_decision_tree_classifier(X_train, y_train)

In [228]:
def predict(model, X_test):
    y_predicted = model.predict(X_test)
    return y_predicted

In [229]:
def predict_probability(model, X_test):
    y_predicted_prob = model.predict_proba(X_test)
    return y_predicted_prob

In [242]:
def generate_metrics(y_predicted, y_test):
    from sklearn.metrics import accuracy_score, precision_score, recall_score

    # average param = [None, 'micro', 'macro', 'weighted'].

    acc = accuracy_score(y_test, y_predicted)
    prec = precision_score(y_test, y_predicted,average='macro')
    recall = recall_score(y_test, y_predicted, average='macro')
    return {'accuracy': round(acc, 2), 'precission': round(prec,2), 'recall': round(recall, 2)}

In [231]:
feature_importance_df = pd.DataFrame(model.feature_importances_)
feature_importance_df.head(10)

Unnamed: 0,0
0,0.021469
1,0.021469
2,0.571965
3,0.385096


In [247]:
y_predicted = predict(model, X_test)
y_predicted_prob = predict_probability(model, X_test)
result = generate_metrics(y_predicted, y_test)
print(result)

{'accuracy': 0.93, 'precission': 0.93, 'recall': 0.94}



## MLflow starts

In [249]:
experiment_name = "basic_decision_tree_classifier" #basic classifier
run_name="iris_flower_classification"
run_metrics = generate_metrics(y_predicted, y_test)
print(run_metrics)

{'accuracy': 0.93, 'precission': 0.93, 'recall': 0.94}


In [250]:
def start_experiment(experiment_name, run_name, run_metrics, model, run_params=None):
    import mlflow

    mlflow.set_experiment(experiment_name=experiment_name)
    with mlflow.start_run(run_name=run_name):
        if run_params != None:
            for param in run_params:
                mlflow.log_param(param, run_params[param])

        for metric in run_metrics:
            mlflow.log_metric(metric, run_metrics[metric])

        mlflow.sklearn.log_model(model, "model")
        mlflow.set_tag("tag1", "Basic Decision Tree Implementation")

    print('Run - %s is logged to Experiment - %s' %(run_name, experiment_name))

In [251]:
start_experiment(experiment_name, run_name, run_metrics, model)

Run - iris_flower_classification is logged to Experiment - basic_decision_tree_classifier


In [252]:
param_dict = {
    'criterion': ["gini", "entropy", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [None, 5, 10, 15, 20, 35, 30],
    'class_weight': [None, "balanced"]
}

def optimised_decision_tree_classifier(X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier()
    gridsearch_cv = GridSearchCV(clf, param_grid=param_dict, cv=10, n_jobs=5)
    model = gridsearch_cv.fit(X_train, y_train)
    return gridsearch_cv, model

In [253]:
result, model = optimised_decision_tree_classifier(X_train, y_train)
print(result.best_estimator_)
print(result.best_score_)
print(result.best_params_)

DecisionTreeClassifier()
0.9609090909090909
{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'splitter': 'best'}


In [255]:
experiment_name = "optimised_decision_tree_classifier" #basic classifier
run_name="iris_flower_classification"
run_metrics = generate_metrics(y_predicted, y_test)
print(run_metrics)

{'accuracy': 0.93, 'precission': 0.93, 'recall': 0.94}


In [256]:
y_predicted = predict(model, X_test)
y_predicted_prob = predict_probability(model, X_test)

In [257]:
metrics = generate_metrics(y_predicted, y_test)
print(metrics)

{'accuracy': 0.96, 'precission': 0.96, 'recall': 0.96}


In [258]:
start_experiment(experiment_name=experiment_name, run_name=run_name, run_metrics=metrics, model=model, run_params=param_dict)

2023/04/07 05:54:29 INFO mlflow.tracking.fluent: Experiment with name 'optimised_decision_tree_classifier' does not exist. Creating a new experiment.


Run - iris_flower_classification is logged to Experiment - optimised_decision_tree_classifier
