In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import mlflow
from mlflow.models import infer_signature

In [None]:
# Path of the CSV dataset; a relative path is resolved against the working directory.
DATASET_URI = "data.csv"

# Read the whole file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=DATASET_URI)

# Last expression of the cell: preview the first five rows.
data.head(n=5)

In [None]:
# Columns used as model inputs; the 'species' column is the label.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Hold out 40% of the rows for testing, stratified on the label, with a
# fixed seed so the split is the same on every run.
train, test = train_test_split(
    data,
    test_size=0.4,
    stratify=data['species'],
    random_state=42,
)

# Separate features and labels for each partition.
X_train, y_train = train[feature_columns], train['species']
X_test, y_test = test[feature_columns], test['species']

In [None]:
# Address of the MLflow tracking server (loopback interface, port 5000).
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

# Direct all subsequent MLflow logging calls to that server.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

In [None]:
# Name of the MLflow experiment that groups the runs logged by this notebook.
MLFLOW_EXPERIMENT_NAME = "Iris Species Classification"

# Make that experiment the active one for subsequent runs.
mlflow.set_experiment(experiment_name=MLFLOW_EXPERIMENT_NAME)

In [None]:
def execute_training_pipeline(hyperparams, X_train, y_train, X_test, y_test, random_state=42):
    """
    Execute the training pipeline: train and evaluate the model. Log parameters, metrics, and model to MLflow.

    A DecisionTreeClassifier is fitted on the training split and scored on the test
    split (accuracy plus macro-averaged precision, recall and F1). The effective
    parameters, the metrics and the fitted model are then recorded in a new MLflow
    run, and the model is registered under the name "IrisDecisionTreeModel".

    Args:
        hyperparams (dict): Hyperparameters for the DecisionTreeClassifier. The dict
            is not modified.
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training labels.
        X_test (pd.DataFrame): Testing features.
        y_test (pd.Series): Testing labels.
        random_state (int | None): Seed given to the classifier when ``hyperparams``
            does not already contain a "random_state" entry; an entry supplied by
            the caller always takes precedence. Pass None to leave the tree unseeded.
    Returns:
        None
    """

    # scikit-learn permutes the features at every split and breaks ties between
    # equally good splits at random, so an unseeded tree can differ from one
    # execution to the next. Seed it (unless the caller already did) so that a
    # logged run can be reproduced, and work on a copy to leave the caller's
    # dict untouched.
    model_params = dict(hyperparams)
    model_params.setdefault("random_state", random_state)

    # Train model
    model = DecisionTreeClassifier(**model_params)
    model.fit(X_train, y_train)

    # Evaluate model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    metrics_dict = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }

    # Log parameters, metrics, and model to MLflow
    with mlflow.start_run():
        # Log the parameters actually used to build the model, seed included.
        mlflow.log_params(model_params)
        mlflow.log_metrics(metrics_dict)
        mlflow.sklearn.log_model(
            sk_model=model,
            name="decision_tree_model",
            # The signature is inferred from the training features and the
            # model's predictions on them.
            signature=infer_signature(X_train, model.predict(X_train)),
            registered_model_name="IrisDecisionTreeModel",
        )

    print("Training pipeline executed and logged to MLflow.")


In [None]:
# Configuration 1: Gini criterion, tree depth capped at 3.
hyperparams_v1 = dict(
    criterion="gini",
    max_depth=3,
    min_samples_split=3,
    min_samples_leaf=1,
)

# Train, evaluate and log a run for this configuration.
execute_training_pipeline(
    hyperparams=hyperparams_v1,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Configuration 2: Gini criterion, tree depth capped at 5.
hyperparams_v2 = dict(
    criterion="gini",
    max_depth=5,
    min_samples_split=3,
    min_samples_leaf=1,
)

# Train, evaluate and log a run for this configuration.
execute_training_pipeline(
    hyperparams=hyperparams_v2,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Configuration 3: Gini criterion, tree depth capped at 8.
hyperparams_v3 = dict(
    criterion="gini",
    max_depth=8,
    min_samples_split=3,
    min_samples_leaf=1,
)

# Train, evaluate and log a run for this configuration.
execute_training_pipeline(
    hyperparams=hyperparams_v3,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)