In [1]:
import joblib
import mlflow
import dagshub
from dotenv import load_dotenv
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score



  import pkg_resources  # noqa: TID251
* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Load the persisted train/test splits, in the order X_train, X_test, y_train, y_test.
# NOTE(review): joblib.load unpickles its input; only load files from a trusted source.
_split_paths = (
    "data/X_train.pkl",
    "data/X_test.pkl",
    "data/y_train.pkl",
    "data/y_test.pkl",
)
X_train, X_test, y_train, y_test = (joblib.load(path) for path in _split_paths)

In [3]:


# Load variables from a .env file (if present) before reading the settings below.
load_dotenv()

# Tracking/repository settings; each value is None when its variable is unset.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI")
repo_owner = os.environ.get("DAGSHUB_REPO_OWNER")
repo_name = os.environ.get("DAGSHUB_REPO_NAME")
experiment_name = os.environ.get("MLFLOW_EXPERIMENT_2_NAME")  # experiment used by this notebook

# Order matters: tracking URI first, then the DagsHub/MLflow integration,
# then the experiment selection (its return value is what the cell displays).
mlflow.set_tracking_uri(tracking_uri)
dagshub.init(repo_name=repo_name, repo_owner=repo_owner, mlflow=True)
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='mlflow-artifacts:/87db2281858c419a84627e673e6374da', creation_time=1760279343877, experiment_id='3', last_update_time=1760279343877, lifecycle_stage='active', name='experiment_2', tags={}>

In [4]:

def train_and_log_rf(X_train, X_test, y_train, y_test):
    """Tune a Random Forest with GridSearchCV and log every candidate to MLflow.

    A parent MLflow run wraps the whole search. Each hyperparameter
    combination is then refit on the full training split inside a nested run
    that records its parameters, its metrics on the test split, and the
    mean/std cross-validation score reported by the grid search. Finally the
    best parameters, the best cross-validation score and the refit best
    estimator are logged on the parent run.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Test features used for the per-candidate metrics.
        y_train: Training labels.
        y_test: Test labels.
            NOTE(review): the metric calls and ``predict_proba(...)[:, 1]``
            look like they expect a binary target - confirm.

    Returns:
        dict: ``grid_search.best_params_``, so callers can reuse the winning
        hyperparameters instead of copying them by hand.
    """

    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    # Metric optimised by the grid search. It is also used to name the logged
    # best score so the MLflow metric always matches what was actually measured.
    cv_scoring = "accuracy"

    # Parent run for the whole experiment
    with mlflow.start_run(run_name="RandomForest_Hyperparameter_Tuning"):

        grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                                   param_grid, cv=5, scoring=cv_scoring, n_jobs=-1)
        grid_search.fit(X_train, y_train)

        # Nested runs for each combination
        for params, mean_score, std_score in zip(grid_search.cv_results_["params"],
                                                 grid_search.cv_results_["mean_test_score"],
                                                 grid_search.cv_results_["std_test_score"]):
            with mlflow.start_run(run_name=f"RF params: {params}", nested=True):
                # Refit on the full training split: the grid search only keeps
                # the best estimator, and test-set metrics are wanted for all.
                model = RandomForestClassifier(**params, random_state=42)
                model.fit(X_train, y_train)

                y_pred = model.predict(X_test)
                # Column 1 is used as the score for ROC AUC.
                y_prob = model.predict_proba(X_test)[:, 1]

                metrics = {
                    "accuracy": accuracy_score(y_test, y_pred),
                    "precision": precision_score(y_test, y_pred),
                    "recall": recall_score(y_test, y_pred),
                    "f1_score": f1_score(y_test, y_pred),
                    "roc_auc": roc_auc_score(y_test, y_prob),
                    "mean_cv_score": mean_score,
                    "std_cv_score": std_score
                }

                # Log params and metrics
                mlflow.log_params(params)
                mlflow.log_metrics(metrics)

                print(f"Params: {params} | Accuracy: {metrics['accuracy']:.4f} | F1: {metrics['f1_score']:.4f} | ROC AUC: {metrics['roc_auc']:.4f}")

        # Log the best model on the parent run
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        # best_score_ is the mean cross-validated value of `cv_scoring`
        # (accuracy here), not an F1 score, so it is labelled accordingly.
        best_cv_score = grid_search.best_score_

        mlflow.log_params(best_params)
        mlflow.log_metric(f"best_cv_{cv_scoring}", best_cv_score)
        mlflow.sklearn.log_model(best_model, artifact_path="model")

        print(f"\nBest Params: {best_params} | Best CV {cv_scoring}: {best_cv_score:.4f}")

    return best_params


# Run the hyperparameter search and log it to the active MLflow experiment.
train_and_log_rf(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)


Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100} | Accuracy: 0.9802 | F1: 0.9811 | ROC AUC: 0.9987
Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200} | Accuracy: 0.9821 | F1: 0.9829 | ROC AUC: 0.9988
Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300} | Accuracy: 0.9812 | F1: 0.9820 | ROC AUC: 0.9988
Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100} | Accuracy: 0.9793 | F1: 0.9803 | ROC AUC: 0.9984
Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200} | Accuracy: 0.9784 | F1: 0.9793 | ROC AUC: 0.9984
Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300} | Accuracy: 0.9812 | F1: 0.9820 | ROC AUC: 0.9984
Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100} | Accuracy: 0.9793 | F1: 0.9

In [None]:
def train_final_rf(X_train, X_test, y_train, y_test, best_params):
    """Fit a Random Forest with the given hyperparameters and record it in MLflow.

    Inside one MLflow run named "Experiment_2_Final_RF_Best_Model" the model
    is trained on the training split and scored on the test split (accuracy,
    precision, recall, F1 and ROC AUC). The parameters, metrics and fitted
    model are logged to MLflow and the metrics are printed.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        best_params: Keyword arguments forwarded to ``RandomForestClassifier``
            (``random_state=42`` is always added).

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # Metrics computed from hard label predictions; ROC AUC is added afterwards
    # because it is computed from probabilities instead.
    label_scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1_score": f1_score,
    }

    with mlflow.start_run(run_name="Experiment_2_Final_RF_Best_Model"):
        # Train with the supplied hyperparameters
        final_rf = RandomForestClassifier(random_state=42, **best_params)
        final_rf.fit(X_train, y_train)

        # Hard predictions and the column-1 probabilities on the test split
        predicted_labels = final_rf.predict(X_test)
        positive_scores = final_rf.predict_proba(X_test)[:, 1]

        metrics = {
            metric_name: scorer(y_test, predicted_labels)
            for metric_name, scorer in label_scorers.items()
        }
        metrics["roc_auc"] = roc_auc_score(y_test, positive_scores)

        # Record parameters, metrics and the model artifact on this run
        mlflow.log_params(best_params)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(final_rf, artifact_path="final_rf_model")

        print("\n Final Random Forest model trained and logged to MLflow.")
        for metric_name, metric_value in metrics.items():
            print(f"{metric_name.capitalize()}: {metric_value:.4f}")

        return final_rf

# Hyperparameters for the final model.
# NOTE(review): these values are hard-coded here rather than read from the
# grid search result - confirm they match the best run of the tuning step.
best_params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
)

final_model = train_final_rf(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=best_params,
)





✅ Final Random Forest model trained and logged to MLflow.
Accuracy: 0.9802
Precision: 0.9909
Recall: 0.9715
F1_score: 0.9811
Roc_auc: 0.9987
