# MLflow Experiments

This notebook performs Multiple MLflow experiments with different model parameters

In [6]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from mlflow.models.signature import infer_signature
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri('http://localhost:5000')

data = pd.read_csv('artifacts/train.csv')

with open('artifacts/preprocessor.pkl', 'rb') as file:
    preprocessor = pickle.load(file)

X = data.drop(['Premium Amount', 'id'], axis=1)
y = data['Premium Amount']

X_transformed = preprocessor.transform(X)

def get_feature_names_safe(transformer, default_name, X_shape):
    try:
        return transformer.get_feature_names_out()
    except AttributeError:
        return np.array([f"{default_name}_{i}" for i in range(X_shape)])

num_features = get_feature_names_safe(
    preprocessor.named_transformers_.get('num_pipeline', None), "num", X_transformed.shape[1]
)
cat_features = get_feature_names_safe(
    preprocessor.named_transformers_.get('cat_pipeline', None), "cat", X_transformed.shape[1]
)

date_features = np.array([f"date_feature_{i}" for i in range(X_transformed.shape[1])])

feature_names = np.concatenate([num_features, cat_features, date_features])

if len(feature_names) != X_transformed.shape[1]:
    print("Warning: Feature names count does not match transformed data shape!")
    feature_names = [f"feature_{i}" for i in range(X_transformed.shape[1])]

X_transformed = pd.DataFrame(X_transformed, columns=feature_names)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test



(     feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
 264  -0.891845  -0.633279   0.009382   0.632983   1.353397   1.125760   
 615   1.321608   1.697909   1.508627  -0.573539   0.070640   0.446393   
 329   1.542953   0.393913  -1.489863   1.215206  -1.212117  -0.742500   
 342   0.583790  -0.475596   0.009382  -0.159911   1.353397  -0.742500   
 394  -1.482099  -0.654751   1.508627   1.539863   0.070640  -0.063133   
 ..         ...        ...        ...        ...        ...        ...   
 71    0.436227   1.374819   0.009382  -0.088712   0.070640  -1.591708   
 106  -1.334536   1.850926   1.508627  -1.137323   0.070640  -1.421867   
 270   0.288663   2.254450   0.009382   1.223793   0.070640   0.616234   
 435   0.141100  -0.940226   0.009382  -0.123851   0.070640   1.125760   
 102   1.247826  -0.887129  -0.740241  -0.303836   0.070640  -1.421867   
 
      feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  \
 264  -0.163619   0.011327  -0.998

### Linear Models Comparison

In [7]:
try:
    mlflow.end_run()
except Exception as e:
    print(f"Note: No active run to end or {str(e)}")

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

client = mlflow.tracking.MlflowClient()

try:
    experiment = client.get_experiment_by_name("insurance_linear_models")
    if experiment:
        if experiment.lifecycle_stage == "deleted":
            client.restore_experiment(experiment.experiment_id)
            print(f"Restored deleted experiment: 'insurance_linear_models'")
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: 'insurance_linear_models' (ID: {experiment_id})")
    else:
        experiment_id = mlflow.create_experiment("insurance_linear_models")
        print(f"Created new experiment: 'insurance_linear_models' (ID: {experiment_id})")
except Exception as e:
    print(f"Error accessing experiment: {e}")
    experiment_id = mlflow.create_experiment("insurance_linear_models_new")
    print(f"Created new experiment with unique name: 'insurance_linear_models_new' (ID: {experiment_id})")

mlflow.set_experiment("insurance_linear_models")

linear_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression (alpha=0.1)": Ridge(alpha=0.1),
    "Ridge Regression (alpha=1.0)": Ridge(alpha=1.0), 
    "Ridge Regression (alpha=10.0)": Ridge(alpha=10.0),
    "Lasso Regression (alpha=0.1)": Lasso(alpha=0.1),
    "Lasso Regression (alpha=1.0)": Lasso(alpha=1.0)
}

results = []

for model_name, model in linear_models.items():
    with mlflow.start_run(run_name=model_name):
        params = model.get_params()
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        
        X_test_float = X_test.astype('float64')
        y_test_float = y_test.astype('float64')
        signature = infer_signature(X_test_float, y_test_float)
        
        mlflow.sklearn.log_model(model, "model", signature=signature)
        
        results.append({"Model": model_name, "RMSE": rmse, "MAE": mae, "R²": r2})
        
        print(f"Model: {model_name}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
results_df

Using existing experiment: 'insurance_linear_models' (ID: 1)
Model: Linear Regression, RMSE: 935.72, MAE: 702.11, R²: -0.0156
🏃 View run Linear Regression at: http://localhost:5000/#/experiments/1/runs/68b50ddfbc324316b76d413cd894cdee
🧪 View experiment at: http://localhost:5000/#/experiments/1
Model: Ridge Regression (alpha=0.1), RMSE: 935.72, MAE: 702.11, R²: -0.0156
🏃 View run Ridge Regression (alpha=0.1) at: http://localhost:5000/#/experiments/1/runs/e5287c9e33ad4841b91d6635200ab21a
🧪 View experiment at: http://localhost:5000/#/experiments/1
Model: Ridge Regression (alpha=1.0), RMSE: 935.69, MAE: 702.08, R²: -0.0155
🏃 View run Ridge Regression (alpha=1.0) at: http://localhost:5000/#/experiments/1/runs/8d674183ac4f4a58a981e96ac4b69ae8
🧪 View experiment at: http://localhost:5000/#/experiments/1
Model: Ridge Regression (alpha=10.0), RMSE: 935.44, MAE: 701.86, R²: -0.0150
🏃 View run Ridge Regression (alpha=10.0) at: http://localhost:5000/#/experiments/1/runs/e217ddf85cc441408409d35e90ca

Unnamed: 0,Model,RMSE,MAE,R²
3,Ridge Regression (alpha=10.0),935.441987,701.861689,-0.015001
5,Lasso Regression (alpha=1.0),935.560765,702.072878,-0.015259
2,Ridge Regression (alpha=1.0),935.690483,702.084945,-0.015541
4,Lasso Regression (alpha=0.1),935.70208,702.106461,-0.015566
1,Ridge Regression (alpha=0.1),935.715974,702.107674,-0.015596
0,Linear Regression,935.718814,702.110204,-0.015602


### Tree-based Models with Hyperparameter Tuning

In [8]:
mlflow.end_run()

client = mlflow.tracking.MlflowClient()

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

try:
    experiment = client.get_experiment_by_name("insurance_tree_models")
    if experiment:
        if experiment.lifecycle_stage == "deleted":
            client.restore_experiment(experiment.experiment_id)
            print(f"Restored deleted experiment: 'insurance_tree_models'")
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: 'insurance_tree_models' (ID: {experiment_id})")
    else:
        experiment_id = mlflow.create_experiment("insurance_tree_models")
        print(f"Created new experiment: 'insurance_tree_models' (ID: {experiment_id})")
except Exception as e:
    print(f"Error accessing experiment: {e}")
    experiment_id = mlflow.create_experiment("insurance_tree_models_new")
    print(f"Created new experiment with unique name: 'insurance_tree_models_new' (ID: {experiment_id})")

mlflow.set_experiment("insurance_tree_models")

tree_models = {
    "Random Forest (10 trees, max_depth=5)": RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42),
    "Random Forest (50 trees, max_depth=10)": RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42),
    "Random Forest (100 trees, max_depth=20)": RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42),
    "Gradient Boosting (50 trees, lr=0.05)": GradientBoostingRegressor(n_estimators=50, learning_rate=0.05, random_state=42),
    "Gradient Boosting (100 trees, lr=0.1)": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "Gradient Boosting (200 trees, lr=0.01)": GradientBoostingRegressor(n_estimators=200, learning_rate=0.01, random_state=42)
}

results = []

for model_name, model in tree_models.items():
    with mlflow.start_run(run_name=model_name):
        params = model.get_params()
        for param_name, param_value in params.items():
            if param_name in ['n_estimators', 'max_depth', 'learning_rate', 'min_samples_split']:
                mlflow.log_param(param_name, param_value)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        X_test_float = X_test.astype('float64')
        y_test_float = y_test.astype('float64')
        signature = infer_signature(X_test_float, y_test_float)
        
        mlflow.sklearn.log_model(model, "model", signature=signature)
        
        results.append({"Model": model_name, "RMSE": rmse, "MAE": mae, "R²": r2})
        
        print(f"Model: {model_name}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
results_df

Created new experiment: 'insurance_tree_models' (ID: 2)
Model: Random Forest (10 trees, max_depth=5), RMSE: 942.80, MAE: 707.99, R²: -0.0310
🏃 View run Random Forest (10 trees, max_depth=5) at: http://localhost:5000/#/experiments/2/runs/0861b1aea1e04e8ea7f4b8a58ff4063c
🧪 View experiment at: http://localhost:5000/#/experiments/2
Model: Random Forest (50 trees, max_depth=10), RMSE: 957.54, MAE: 749.96, R²: -0.0635
🏃 View run Random Forest (50 trees, max_depth=10) at: http://localhost:5000/#/experiments/2/runs/5a462c0792b847209b83a79a85775d13
🧪 View experiment at: http://localhost:5000/#/experiments/2
Model: Random Forest (100 trees, max_depth=20), RMSE: 967.53, MAE: 757.57, R²: -0.0858
🏃 View run Random Forest (100 trees, max_depth=20) at: http://localhost:5000/#/experiments/2/runs/a4817c4bf6fd49f7b426325e77fe2725
🧪 View experiment at: http://localhost:5000/#/experiments/2
Model: Gradient Boosting (50 trees, lr=0.05), RMSE: 935.33, MAE: 708.53, R²: -0.0148
🏃 View run Gradient Boosting (5

Unnamed: 0,Model,RMSE,MAE,R²
5,"Gradient Boosting (200 trees, lr=0.01)",931.263453,705.229572,-0.005954
3,"Gradient Boosting (50 trees, lr=0.05)",935.33378,708.531631,-0.014767
0,"Random Forest (10 trees, max_depth=5)",942.798639,707.987078,-0.031029
1,"Random Forest (50 trees, max_depth=10)",957.538607,749.96068,-0.06352
4,"Gradient Boosting (100 trees, lr=0.1)",963.789106,739.318214,-0.07745
2,"Random Forest (100 trees, max_depth=20)",967.529839,757.574956,-0.08583


### XGBoost with Advanced Parameters

In [9]:
mlflow.end_run()

os.environ["MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING"] = "false"

client = mlflow.tracking.MlflowClient()

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

try:
    experiment = client.get_experiment_by_name("insurance_xgboost_models")
    if experiment:
        if experiment.lifecycle_stage == "deleted":
            client.restore_experiment(experiment.experiment_id)
            print(f"Restored deleted experiment: 'insurance_xgboost_models'")
        experiment_id = experiment.experiment_id
        print(f"Using existing experiment: 'insurance_xgboost_models' (ID: {experiment_id})")
    else:
        experiment_id = mlflow.create_experiment("insurance_xgboost_models")
        print(f"Created new experiment: 'insurance_xgboost_models' (ID: {experiment_id})")
except Exception as e:
    print(f"Error accessing experiment: {e}")
    experiment_id = mlflow.create_experiment("insurance_xgboost_models_new")
    print(f"Created new experiment with unique name: 'insurance_xgboost_models_new' (ID: {experiment_id})")

mlflow.set_experiment("insurance_xgboost_models")

xgb_models = {
    "XGBoost (Basic)": XGBRegressor(n_estimators=100, random_state=42),
    "XGBoost (Max Depth=3, LR=0.1)": XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42),
    "XGBoost (Max Depth=5, LR=0.05)": XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.05, random_state=42),
    "XGBoost (Max Depth=7, LR=0.01)": XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.01, random_state=42),
    "XGBoost (200 estimators, subsample=0.8)": XGBRegressor(
        n_estimators=200, max_depth=5, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, random_state=42),
    "XGBoost (Regularized l1=1, l2=1)": XGBRegressor(
        n_estimators=100, max_depth=5, learning_rate=0.05, reg_alpha=1, reg_lambda=1, random_state=42)
}

results = []

for model_name, model in xgb_models.items():
    with mlflow.start_run(run_name=model_name):
        params = model.get_params()
        for param_name, param_value in params.items():
            if param_name in ['n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda']:
                mlflow.log_param(param_name, param_value)
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        rmse, mae, r2 = eval_metrics(y_test, y_pred)
        
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)
        
        results.append({"Model": model_name, "RMSE": rmse, "MAE": mae, "R²": r2})
        
        print(f"Model: {model_name}, RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

best_model = max(results, key=lambda x: x["R²"])
best_model_name = best_model["Model"]
best_model_instance = xgb_models[best_model_name]

X_test_float = X_test.astype('float64')
y_test_float = y_test.astype('float64')
signature = infer_signature(X_test_float, y_test_float)

input_example = X_test.iloc[:5]

mlflow.sklearn.log_model(best_model_instance, "best_model", signature=signature, input_example=input_example)

results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
print(results_df)

Created new experiment: 'insurance_xgboost_models' (ID: 3)
Model: XGBoost (Basic), RMSE: 977.17, MAE: 751.40, R²: -0.1076
🏃 View run XGBoost (Basic) at: http://localhost:5000/#/experiments/3/runs/f2a26999299e47aea9281d9c7d1fd313
🧪 View experiment at: http://localhost:5000/#/experiments/3
Model: XGBoost (Max Depth=3, LR=0.1), RMSE: 960.73, MAE: 739.61, R²: -0.0706
🏃 View run XGBoost (Max Depth=3, LR=0.1) at: http://localhost:5000/#/experiments/3/runs/64076d104a6e4f1593d4c0688098d6a0
🧪 View experiment at: http://localhost:5000/#/experiments/3
Model: XGBoost (Max Depth=5, LR=0.05), RMSE: 949.63, MAE: 732.86, R²: -0.0460
🏃 View run XGBoost (Max Depth=5, LR=0.05) at: http://localhost:5000/#/experiments/3/runs/9aba7ffa5eda45a3ab0d227f7833911a
🧪 View experiment at: http://localhost:5000/#/experiments/3
Model: XGBoost (Max Depth=7, LR=0.01), RMSE: 937.34, MAE: 714.76, R²: -0.0191
🏃 View run XGBoost (Max Depth=7, LR=0.01) at: http://localhost:5000/#/experiments/3/runs/17d322a73c2e4e868c2ca26e39

## Summary of Experiments

We've run three different MLflow experiments:

1. **Linear Models Comparison** - Compared various linear regression models with different regularization parameters
2. **Tree-based Models with Hyperparameter Tuning** - Explored Random Forest and Gradient Boosting with different hyperparameters
3. **XGBoost with Advanced Parameters** - Fine-tuned XGBoost models with various advanced settings

You can view all these experiments in the MLflow UI by navigating to http://localhost:5000.

## Best Model Selection

Let's retrieve the best performing model from MLflow and use it for our insurance premium prediction:

In [10]:
client = mlflow.tracking.MlflowClient()

best_models = []

for experiment_name in ["insurance_linear_models", "insurance_tree_models", "insurance_xgboost_models"]:
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment:
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            order_by=["metrics.r2 DESC"],
            max_results=1
        )
        
        if runs:
            best_run = runs[0]
            best_models.append({
                "Experiment": experiment_name,
                "Run ID": best_run.info.run_id,
                "Model": best_run.data.tags.get("mlflow.runName", "Unknown"),
                "R²": best_run.data.metrics.get("r2", 0),
                "RMSE": best_run.data.metrics.get("rmse", 0),
                "MAE": best_run.data.metrics.get("mae", 0)
            })

pd.DataFrame(best_models).sort_values("R²", ascending=False)

Unnamed: 0,Experiment,Run ID,Model,R²,RMSE,MAE
1,insurance_tree_models,b3d406953cb84e85919d2764077f4811,"Gradient Boosting (200 trees, lr=0.01)",-0.005954,931.263453,705.229572
0,insurance_linear_models,e217ddf85cc441408409d35e90caedec,Ridge Regression (alpha=10.0),-0.015001,935.441987,701.861689
2,insurance_xgboost_models,17d322a73c2e4e868c2ca26e3914ea76,"XGBoost (Max Depth=7, LR=0.01)",-0.019129,937.342153,714.760437


## Final Model Loading

You can load the best model directly from MLflow for production use:

In [11]:
best_run_id = "4312aea0a8f34cadb3ed3bf898eeb6bd"

try:
    best_model = mlflow.sklearn.load_model(f"runs:/{best_run_id}/model")
    print(f"Successfully loaded model from run ID: {best_run_id}")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please replace the run ID with an actual run ID from the table above.")

Error loading model: RESOURCE_DOES_NOT_EXIST: Run with id=4312aea0a8f34cadb3ed3bf898eeb6bd not found
Please replace the run ID with an actual run ID from the table above.
