# MLflow Experiments

This notebook performs Multiple MLflow experiments with different model parameters

In [64]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from mlflow.models.signature import infer_signature
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri('http://localhost:5000')

data = pd.read_csv('artifacts/train.csv')

with open('artifacts/preprocessor.pkl', 'rb') as file:
    preprocessor = pickle.load(file)

def prepare_data(data):
    X = data.drop(['Premium Amount', 'id'], axis=1)
    y = data['Premium Amount']

    X_transformed = preprocessor.transform(X)

    def get_feature_names_safe(transformer, default_name, X_shape):
        try:
            return transformer.get_feature_names_out()
        except AttributeError:
            return np.array([f"{default_name}_{i}" for i in range(X_shape)])

    num_features = get_feature_names_safe(
        preprocessor.named_transformers_.get('num_pipeline', None), "num", X_transformed.shape[1]
    )
    cat_features = get_feature_names_safe(
        preprocessor.named_transformers_.get('cat_pipeline', None), "cat", X_transformed.shape[1]
    )

    date_features = np.array([f"date_feature_{i}" for i in range(X_transformed.shape[1])])

    feature_names = np.concatenate([num_features, cat_features, date_features])

    if len(feature_names) != X_transformed.shape[1]:
        print("Warning: Feature names count does not match transformed data shape!")
        feature_names = [f"feature_{i}" for i in range(X_transformed.shape[1])]

    X_transformed = pd.DataFrame(X_transformed, columns=feature_names)

    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

### Linear Models Comparison

In [65]:
try:
    mlflow.end_run()
except Exception as e:
    print(f"Note: No active run to end or {str(e)}")

client = mlflow.tracking.MlflowClient()

try:
    experiment = client.get_experiment_by_name("insurance_linear_models")
    if experiment:
        if experiment.lifecycle_stage == "deleted":
            client.restore_experiment(experiment.experiment_id)
            print(f"Restored deleted experiment: 'insurance_linear_models'")
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: 'insurance_linear_models' (ID: {experiment_id})")
    else:
        experiment_id = mlflow.create_experiment("insurance_linear_models")
        print(f"Created new experiment: 'insurance_linear_models' (ID: {experiment_id})")
except Exception as e:
    print(f"Error accessing experiment: {e}")
    experiment_id = mlflow.create_experiment("insurance_linear_models_new")
    print(f"Created new experiment with unique name: 'insurance_linear_models_new' (ID: {experiment_id})")

mlflow.set_experiment("insurance_linear_models")

linear_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression (alpha=0.1)": Ridge(alpha=0.1),
    "Ridge Regression (alpha=1.0)": Ridge(alpha=1.0), 
    "Ridge Regression (alpha=10.0)": Ridge(alpha=10.0),
    "Lasso Regression (alpha=0.1)": Lasso(alpha=0.1),
    "Lasso Regression (alpha=1.0)": Lasso(alpha=1.0)
}

results = []

for model_name, model in linear_models.items():
    with mlflow.start_run(run_name=model_name):
        params = model.get_params()
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        
        X_test_float = X_test.astype('float64')
        y_test_float = y_test.astype('float64')
        signature = infer_signature(X_test_float, y_test_float)
        
        mlflow.sklearn.log_model(model, "model", signature=signature)
        
        results.append({"Model": model_name, "RMSE": rmse, "MAE": mae, "R²": r2})
        
        print(f"Model: {model_name}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
results_df

Note: No active run to end or RESOURCE_DOES_NOT_EXIST: Run with id=6de1aee0f9f6451ba8fd06a7a16f5509 not found
Created new experiment: 'insurance_linear_models' (ID: 1)
Model: Linear Regression, RMSE: 802.25, MAE: 620.35, R²: -0.0071
🏃 View run Linear Regression at: http://localhost:5000/#/experiments/1/runs/de7c3f378e864aef96bf3275df8e0a9c
🧪 View experiment at: http://localhost:5000/#/experiments/1
Model: Ridge Regression (alpha=0.1), RMSE: 802.24, MAE: 620.34, R²: -0.0071
🏃 View run Ridge Regression (alpha=0.1) at: http://localhost:5000/#/experiments/1/runs/1e7fb26c8c2248b3b8448f17c4f277e7
🧪 View experiment at: http://localhost:5000/#/experiments/1
Model: Ridge Regression (alpha=1.0), RMSE: 802.21, MAE: 620.32, R²: -0.0070
🏃 View run Ridge Regression (alpha=1.0) at: http://localhost:5000/#/experiments/1/runs/2cdc421e3ac4492c872a21887022a322
🧪 View experiment at: http://localhost:5000/#/experiments/1
Model: Ridge Regression (alpha=10.0), RMSE: 801.87, MAE: 620.07, R²: -0.0062
🏃 View ru

Unnamed: 0,Model,RMSE,MAE,R²
5,Lasso Regression (alpha=1.0),801.712798,620.272136,-0.005806
3,Ridge Regression (alpha=10.0),801.872637,620.067844,-0.006207
4,Lasso Regression (alpha=0.1),802.190518,620.339497,-0.007005
2,Ridge Regression (alpha=1.0),802.20665,620.318569,-0.007046
1,Ridge Regression (alpha=0.1),802.241206,620.344102,-0.007132
0,Linear Regression,802.245059,620.346944,-0.007142


### Tree-based Models with Hyperparameter Tuning

In [66]:
mlflow.end_run()

client = mlflow.tracking.MlflowClient()

try:
    experiment = client.get_experiment_by_name("insurance_tree_models")
    if experiment:
        if experiment.lifecycle_stage == "deleted":
            client.restore_experiment(experiment.experiment_id)
            print(f"Restored deleted experiment: 'insurance_tree_models'")
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: 'insurance_tree_models' (ID: {experiment_id})")
    else:
        experiment_id = mlflow.create_experiment("insurance_tree_models")
        print(f"Created new experiment: 'insurance_tree_models' (ID: {experiment_id})")
except Exception as e:
    print(f"Error accessing experiment: {e}")
    experiment_id = mlflow.create_experiment("insurance_tree_models_new")
    print(f"Created new experiment with unique name: 'insurance_tree_models_new' (ID: {experiment_id})")

mlflow.set_experiment("insurance_tree_models")

tree_models = {
    "Random Forest (10 trees, max_depth=5)": RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42),
    "Random Forest (50 trees, max_depth=10)": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    "Random Forest (100 trees, max_depth=20)": RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42),
    "Gradient Boosting (50 trees, lr=0.05)": GradientBoostingRegressor(n_estimators=50, learning_rate=0.05, random_state=42),
    "Gradient Boosting (100 trees, lr=0.1)": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "Gradient Boosting (200 trees, lr=0.01)": GradientBoostingRegressor(n_estimators=200, learning_rate=0.01, random_state=42)
}

results = []

for model_name, model in tree_models.items():
    with mlflow.start_run(run_name=model_name):
        params = model.get_params()
        for param_name, param_value in params.items():
            if param_name in ['n_estimators', 'max_depth', 'learning_rate', 'min_samples_split']:
                mlflow.log_param(param_name, param_value)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        X_test_float = X_test.astype('float64')
        y_test_float = y_test.astype('float64')
        signature = infer_signature(X_test_float, y_test_float)
        
        mlflow.sklearn.log_model(model, "model", signature=signature)
        
        results.append({"Model": model_name, "RMSE": rmse, "MAE": mae, "R²": r2})
        
        print(f"Model: {model_name}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
results_df

Created new experiment: 'insurance_tree_models' (ID: 2)
Model: Random Forest (10 trees, max_depth=5), RMSE: 835.53, MAE: 636.95, R²: -0.0924
🏃 View run Random Forest (10 trees, max_depth=5) at: http://localhost:5000/#/experiments/2/runs/3f3db39302684c6cb63985e5340c23e4
🧪 View experiment at: http://localhost:5000/#/experiments/2
Model: Random Forest (50 trees, max_depth=10), RMSE: 803.43, MAE: 623.70, R²: -0.0101
🏃 View run Random Forest (50 trees, max_depth=10) at: http://localhost:5000/#/experiments/2/runs/1922a197886a4093834f5a54abeb7b3c
🧪 View experiment at: http://localhost:5000/#/experiments/2
Model: Random Forest (100 trees, max_depth=20), RMSE: 815.94, MAE: 632.34, R²: -0.0418
🏃 View run Random Forest (100 trees, max_depth=20) at: http://localhost:5000/#/experiments/2/runs/9aeaefb7d57c4dbea628b55b5ee9bc32
🧪 View experiment at: http://localhost:5000/#/experiments/2
Model: Gradient Boosting (50 trees, lr=0.05), RMSE: 798.86, MAE: 610.31, R²: 0.0013
🏃 View run Gradient Boosting (50

Unnamed: 0,Model,RMSE,MAE,R²
5,"Gradient Boosting (200 trees, lr=0.01)",797.945814,608.906417,0.003624
3,"Gradient Boosting (50 trees, lr=0.05)",798.861483,610.30903,0.001336
1,"Random Forest (50 trees, max_depth=10)",803.432741,623.701871,-0.010126
2,"Random Forest (100 trees, max_depth=20)",815.937842,632.335072,-0.041815
4,"Gradient Boosting (100 trees, lr=0.1)",822.136561,637.916469,-0.057705
0,"Random Forest (10 trees, max_depth=5)",835.525944,636.95479,-0.092437


### XGBoost with Advanced Parameters

In [67]:
mlflow.end_run()

os.environ["MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING"] = "false"

client = mlflow.tracking.MlflowClient()

try:
    experiment = client.get_experiment_by_name("insurance_xgboost_models")
    if experiment:
        if experiment.lifecycle_stage == "deleted":
            client.restore_experiment(experiment.experiment_id)
            print(f"Restored deleted experiment: 'insurance_xgboost_models'")
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: 'insurance_xgboost_models' (ID: {experiment_id})")
    else:
        experiment_id = mlflow.create_experiment("insurance_xgboost_models")
        print(f"Created new experiment: 'insurance_xgboost_models' (ID: {experiment_id})")
except Exception as e:
    print(f"Error accessing experiment: {e}")
    experiment_id = mlflow.create_experiment("insurance_xgboost_models_new")
    print(f"Created new experiment with unique name: 'insurance_xgboost_models_new' (ID: {experiment_id})")

mlflow.set_experiment("insurance_xgboost_models")

xgb_models = {
    "XGBoost (Basic)": XGBRegressor(n_estimators=100, random_state=42),
    "XGBoost (Max Depth=3, LR=0.1)": XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42),
    "XGBoost (Max Depth=5, LR=0.05)": XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=42),
    "XGBoost (Max Depth=7, LR=0.01)": XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.01, random_state=42),
    "XGBoost (200 estimators, subsample=0.8)": XGBRegressor(
        n_estimators=200, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42),
    "XGBoost (Regularized l1=1, l2=1)": XGBRegressor(
        n_estimators=100, max_depth=5, learning_rate=0.05, reg_alpha=1, reg_lambda=1, random_state=42)
}

results = []

for model_name, model in xgb_models.items():
    with mlflow.start_run(run_name=model_name):
        params = model.get_params()
        for param_name, param_value in params.items():
            if param_name in ['n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda']:
                mlflow.log_param(param_name, param_value)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        
        results.append({"Model": model_name, "RMSE": rmse, "MAE": mae, "R²": r2})
        
        print(f"Model: {model_name}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

best_model = max(results, key=lambda x: x["R²"])
best_model_name = best_model["Model"]
best_model_instance = xgb_models[best_model_name]

X_test_float = X_test.astype('float64')
y_test_float = y_test.astype('float64')
signature = infer_signature(X_test_float, y_test_float)

input_example = X_test.iloc[:5]

mlflow.sklearn.log_model(best_model_instance, "best_model", signature=signature, input_example=input_example)

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
print(results_df)

Created new experiment: 'insurance_xgboost_models' (ID: 3)
Model: XGBoost (Basic), RMSE: 899.64, MAE: 722.11, R²: -0.2665
🏃 View run XGBoost (Basic) at: http://localhost:5000/#/experiments/3/runs/b707bc98f5d7424eae753dc9c5371871
🧪 View experiment at: http://localhost:5000/#/experiments/3
Model: XGBoost (Max Depth=3, LR=0.1), RMSE: 843.65, MAE: 656.62, R²: -0.1138
🏃 View run XGBoost (Max Depth=3, LR=0.1) at: http://localhost:5000/#/experiments/3/runs/fdd13c735e2d4a93ae35017ef172e1c2
🧪 View experiment at: http://localhost:5000/#/experiments/3
Model: XGBoost (Max Depth=5, LR=0.05), RMSE: 820.41, MAE: 626.31, R²: -0.0533
🏃 View run XGBoost (Max Depth=5, LR=0.05) at: http://localhost:5000/#/experiments/3/runs/9a449f7b4ce6422bb1d680d9733c1ceb
🧪 View experiment at: http://localhost:5000/#/experiments/3
Model: XGBoost (Max Depth=7, LR=0.01), RMSE: 817.69, MAE: 631.63, R²: -0.0463
🏃 View run XGBoost (Max Depth=7, LR=0.01) at: http://localhost:5000/#/experiments/3/runs/23d328514dc14bf9a970b897c9

## Summary of Experiments

We've run three different MLflow experiments:

1. **Linear Models Comparison** - Compared various linear regression models with different regularization parameters
2. **Tree-based Models with Hyperparameter Tuning** - Explored Random Forest and Gradient Boosting with different hyperparameters
3. **XGBoost with Advanced Parameters** - Fine-tuned XGBoost models with various advanced settings

You can view all these experiments in the MLflow UI by navigating to http://localhost:5000.

## Best Model Selection

Let's retrieve the best performing model from MLflow and use it for our insurance premium prediction:

In [68]:
client = mlflow.tracking.MlflowClient()

best_models = []

for experiment_name in ["insurance_linear_models", "insurance_tree_models", "insurance_xgboost_models"]:
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment:
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["metrics.r2 DESC"],
            max_results=1
        )
        
        if runs:
            best_run = runs[0]
            best_models.append({
                "Experiment": experiment_name,
                "Run ID": best_run.info.run_id,
                "Model": best_run.data.tags.get("mlflow.runName", "Unknown"),
                "R²": best_run.data.metrics.get("r2", 0),
                "RMSE": best_run.data.metrics.get("rmse", 0),
                "MAE": best_run.data.metrics.get("mae", 0)
            })

pd.DataFrame(best_models).sort_values("R²", ascending=False)

Unnamed: 0,Experiment,Run ID,Model,R²,RMSE,MAE
1,insurance_tree_models,518eba59281048938664ea042cbe708c,"Gradient Boosting (200 trees, lr=0.01)",0.003624,797.945814,608.906417
0,insurance_linear_models,2336007bf8ee4adcb0912e1372ac8389,Lasso Regression (alpha=1.0),-0.005806,801.712798,620.272136
2,insurance_xgboost_models,cce749a8768a43089cf7bfcebf102e9a,"XGBoost (Regularized l1=1, l2=1)",-0.045821,817.505046,628.956909


## Final Model Loading

You can load the best model directly from MLflow for production use:

In [69]:
best_run_id = "4312aea0a8f34cadb3ed3bf898eeb6bd"

try:
    best_model = mlflow.sklearn.load_model(f"runs:/{best_run_id}/model")
    print(f"Successfully loaded model from run ID: {best_run_id}")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please replace the run ID with an actual run ID from the table above.")

Error loading model: RESOURCE_DOES_NOT_EXIST: Run with id=4312aea0a8f34cadb3ed3bf898eeb6bd not found
Please replace the run ID with an actual run ID from the table above.
