In this notebook, you should implement a first version of a working machine learning model to predict the age of an abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

# Imports

In [1]:
import mlflow
import numpy as np
import pandas as pd
from mlflow import MlflowClient
from mlflow.models import infer_signature
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Models

In [2]:
# Load the raw data and swap the "Rings" column for the regression target
# "Age", computed as Rings + 1.5.
abalone_csv = "../assets/data/abalone.csv"
df = (
    pd.read_csv(abalone_csv)
    .assign(Age=lambda frame: frame["Rings"] + 1.5)
    .drop(columns="Rings")
)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,16.5
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8.5
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,10.5
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,11.5
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8.5


In [3]:
# --- filter outliers ---
# Numeric measurement columns screened with the 1.5 * IQR rule.
numerical_cols = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
]
categorical_features = ["Sex"]

# Columns are processed one after another, so the quartiles of each column are
# computed on the rows that survived the previous columns' filters.
df_filtered = df.copy()
for feature in numerical_cols:
    values = df_filtered[feature]
    q_low, q_high = values.quantile([0.25, 0.75])
    spread = q_high - q_low
    within_fences = values.between(q_low - 1.5 * spread, q_high + 1.5 * spread)
    # Rows with a missing value in this column are kept, not treated as outliers.
    df_filtered = df_filtered[values.isna() | within_fences]

print(f"Original shape: {df.shape}")
print(f"After IQR filtering: {df_filtered.shape}")

Original shape: (4177, 9)
After IQR filtering: (4013, 9)


In [4]:
# --- set mlflow experiment ---
# Configure the tracking store BEFORE creating the client: MlflowClient resolves
# its tracking URI when it is constructed, so a client built earlier would keep
# pointing at the default store instead of ../mlruns.
mlflow.set_tracking_uri("../mlruns")
client = MlflowClient()
# Creates the experiment if it does not exist yet and makes it the active one
# for the runs started below.
mlflow.set_experiment("Age_Prediction_Experiment")

<Experiment: artifact_location='/Users/joaosilva/Current/HEC/MLOps/xhec-mlops-2025-project/notebooks/../mlruns/839534043173012257', creation_time=1761212772548, experiment_id='839534043173012257', last_update_time=1761212772548, lifecycle_stage='active', name='Age_Prediction_Experiment', tags={}>

In [5]:
# --- fit models ---
# Train every candidate regressor inside its own MLflow run, logging its
# hyper-parameters, hold-out metrics and the fitted pipeline, then print a
# comparison table sorted by RMSE.
# NOTE(review): df_filtered has already been outlier-filtered, so the test split
# below is drawn from filtered data as well.
target = "Age"
X = df_filtered.drop(columns=[target])
y = df_filtered[target]
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(exclude=[np.number]).columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the non-numeric columns and pass every other column through
# unchanged. This instance is only a template: each pipeline gets its own clone.
preprocessor = ColumnTransformer(
    [("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)],
    remainder="passthrough",
)

models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}

results = {}
for name, model in models.items():
    # Name the run after the model so runs are identifiable in the MLflow UI.
    with mlflow.start_run(run_name=name):
        # Pipeline does not copy its steps; without clone() all four pipelines
        # would share (and refit) the very same ColumnTransformer object.
        pipe = Pipeline([("preprocessor", clone(preprocessor)), ("model", model)])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        metrics = {
            "MAE": mean_absolute_error(y_test, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "R2": r2_score(y_test, y_pred),
        }

        # Log params and metrics first so they are recorded even if saving or
        # registering the model fails afterwards.
        mlflow.log_params({f"{name}_{k}": v for k, v in model.get_params().items()})
        mlflow.log_metrics(metrics)

        # NOTE(review): every candidate is registered as a new version of
        # "age_predicter", so the newest version is simply the last entry of
        # `models`, not necessarily the best-scoring one.
        mlflow.sklearn.log_model(
            pipe,
            artifact_path="models",
            input_example=X_test.iloc[:5],
            signature=infer_signature(X_test, y_pred),
            registered_model_name="age_predicter",
        )

        results[name] = metrics

results_df = pd.DataFrame(results).T.sort_values("RMSE")
print("\nModel performance:")
print(results_df.round(3))

Registered model 'age_predicter' already exists. Creating a new version of this model...
Created version '5' of model 'age_predicter'.
Registered model 'age_predicter' already exists. Creating a new version of this model...
Created version '6' of model 'age_predicter'.
Registered model 'age_predicter' already exists. Creating a new version of this model...
Created version '7' of model 'age_predicter'.
Registered model 'age_predicter' already exists. Creating a new version of this model...



Model performance:
                    MAE   RMSE     R2
GradientBoosting  1.487  2.088  0.559
RandomForest      1.515  2.096  0.556
LinearRegression  1.550  2.112  0.549
Ridge             1.552  2.124  0.544


Created version '8' of model 'age_predicter'.
