In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
pd.set_option('display.max_columns', 500)

# Data

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rodolfomendes/abalone-dataset")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

df = pd.read_csv(os.path.join(path, "abalone.csv"))
df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [7]:
from sklearn.model_selection import train_test_split

# train test split

train, test = train_test_split(df, test_size=0.25, random_state=1)
print("Train data points :", len(train))
print("Test data points :", len(test))

Train data points : 3132
Test data points : 1045


In [9]:
numerical_features = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
]

categorical_feature = "Sex"

features = numerical_features + [categorical_feature]

target = "Rings"

Label encoding

In [15]:
train.Sex = train.Sex.replace({"M": 1, "I": 0, "F": -1})
test.Sex = test.Sex.replace({"M": 1, "I": 0, "F": -1})

  train.Sex = train.Sex.replace({"M": 1, "I": 0, "F": -1})
  test.Sex = test.Sex.replace({"M": 1, "I": 0, "F": -1})


Removing outliers

In [16]:
idx = train.loc[train.Height > 0.4].index
train.drop(idx, inplace=True)

idx = train.loc[train["Viscera weight"] > 0.6].index
train.drop(idx, inplace=True)

idx = train.loc[train[target] > 25].index
train.drop(idx, inplace=True)

Feature separation

In [17]:
X_train = train[features]
y_train = train[target]

X_test = test[features]
y_test = test[target]

X_train.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex
4014,0.625,0.48,0.175,1.065,0.4865,0.259,0.285,1
3252,0.48,0.38,0.13,0.6175,0.3,0.142,0.175,1
305,0.2,0.145,0.06,0.037,0.0125,0.0095,0.011,0
1857,0.505,0.4,0.145,0.7045,0.334,0.1425,0.207,0
439,0.5,0.415,0.165,0.6885,0.249,0.138,0.25,1


In [34]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

models = {
    "linear_regression": LinearRegression(),
    "lasso": Lasso(random_state=1),
    "decision_tree": DecisionTreeRegressor(random_state=1),
    "random_forest": RandomForestRegressor(random_state=1),
    "xgboost": XGBRegressor(random_state=1),
}

Hyperparameter tuning

In [35]:
# Linear regression
lr_params = {"fit_intercept": [True, False]}

# Lasso
lasso_params = {"alpha": [1e-4, 1e-3, 1e-2, 1, 10, 100]}

# Decision tree
dt_params = {
    "max_depth": [4, 6, 8, 10, 12, 14, 16, 20],
    "min_samples_split": [5, 10, 20, 30, 40, 50],
    "max_features": [0.2, 0.4, 0.6, 0.8, 1],
    "max_leaf_nodes": [8, 16, 32, 64, 128, 256],
}

# Random Forest
rf_params = {
    "bootstrap": [True, False],
    "max_depth": [2, 5, 10, 20, None],
    "max_features": ["log2", "sqrt"],
    "min_samples_leaf": [1, 2, 4],
    "min_samples_split": [2, 5, 10],
    "n_estimators": [100, 150, 200, 250],
}

# XGBoost
xgb_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": list(range(1, 10)),
    "learning_rate": [0.006, 0.007, 0.008, 0.05, 0.09],
    "min_child_weight": list(range(1, 10)),
}

In [36]:
from mlflow.tracking import MlflowClient

# Create an MLflow client
client = MlflowClient()

In [45]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Start MLflow experiment
mlflow.set_experiment("abalone-experiment")

# Variables to track the best model and RMSE
best_rmse = float("inf")
best_run_id = None
best_model_name = None

# Loop through each model and its corresponding parameter grid
for model_name, model in models.items():
    print(f"Training {model_name}")

    # Define parameter grid
    if model_name == "linear_regression":
        param_grid = lr_params
    elif model_name == "lasso":
        param_grid = lasso_params
    elif model_name == "decision_tree":
        param_grid = dt_params
    elif model_name == "random_forest":
        param_grid = rf_params
    elif model_name == "xgboost":
        param_grid = xgb_params

    # Perform RandomizedSearchCV
    regressor = RandomizedSearchCV(
        estimator=model,
        n_iter=10,
        param_distributions=param_grid,
        cv=3,
        scoring="neg_root_mean_squared_error",
    )

    # Fit the model
    search = regressor.fit(X_train, y_train)

    # Get the best model and RMSE
    best_model = search.best_estimator_
    rmse = -search.best_score_  # Negating because it's negative RMSE from scoring

    # Log experiment details with MLflow
    with mlflow.start_run(run_name=model_name) as run:
        # Log the best parameters and RMSE
        mlflow.log_params(search.best_params_)
        mlflow.log_metric("rmse", rmse)

        # Log the model
        mlflow.sklearn.log_model(best_model, model_name, input_example=)

        # Update best model if current one has lower RMSE
        if rmse < best_rmse:
            best_rmse = rmse
            best_run_id = run.info.run_id
            best_model_name = model_name

    print(f"{model_name} done, RMSE: {rmse}")

# Register the model with the lowest RMSE in MLflow Model Registry
if best_run_id and best_model_name:
    model_uri = f"runs:/{best_run_id}/{best_model_name}"
    print(f"Registering the best model: {best_model_name} with RMSE: {best_rmse}")
    mlflow.register_model(model_uri, best_model_name)

Training linear_regression




linear_regression done, RMSE: 2.179814440748505
Training lasso




lasso done, RMSE: 2.1797611054236348
Training decision_tree




decision_tree done, RMSE: 2.354716697047636
Training random_forest




random_forest done, RMSE: 2.1249213997107375
Training xgboost


Successfully registered model 'random_forest'.


xgboost done, RMSE: 2.149219586546676
Registering the best model: random_forest with RMSE: 2.1249213997107375


Created version '1' of model 'random_forest'.


In [49]:
client.search_experiments()

[<Experiment: artifact_location='file:///c:/Users/dorab/OneDrive%20-%20Ecole%20Polytechnique/Documents/DSB%20-%20HECxX/Year%202/MLOps/xhec-mlops-project-student/notebooks/mlruns/894000434234899817', creation_time=1729763952590, experiment_id='894000434234899817', last_update_time=1729763952590, lifecycle_stage='active', name='abalone-experiment', tags={}>,
 <Experiment: artifact_location='file:///c:/Users/dorab/OneDrive%20-%20Ecole%20Polytechnique/Documents/DSB%20-%20HECxX/Year%202/MLOps/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1729763943669, experiment_id='0', last_update_time=1729763943669, lifecycle_stage='active', name='Default', tags={}>]

In [50]:
!mlflow ui --host 0.0.0.0 --port 5002

^C
