# Modelling - Abalone Age Prediction

Ce notebook implémente un modèle de régression pour prédire l'âge des ormeaux (abalones), approximé par le nombre d'anneaux (colonne `Rings`).

**Approche :**
- Modèle de régression linéaire de référence (baseline), entraîné sur l'ensemble des variables explicatives
- Tracking des expériences avec MLflow
- Évaluation avec plusieurs métriques

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import mlflow
import mlflow.sklearn

# Select the MLflow experiment under which the runs started later in this
# notebook are recorded.
mlflow.set_experiment("abalone-age-prediction")


## 1. Chargement des données


In [None]:
# Read the raw abalone table from the parent directory and report its
# dimensions as (rows, columns).
df = pd.read_csv("../abalone.csv")
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")

# Last expression of the cell: the notebook renders the first rows.
df.head()


## 2. Préparation des données


In [None]:
# Integer-coded copy of Sex. It is kept on `df` (together with the fitted
# encoder `le`) so that any later code relying on either keeps working, but
# it is deliberately NOT used as a model feature: LabelEncoder is meant for
# target labels, and its integer codes would be read by a linear model as a
# numeric scale, imposing an arbitrary ordering on a nominal category.
le = LabelEncoder()
df['Sex_encoded'] = le.fit_transform(df['Sex'])

# One-hot encode Sex for the model instead. drop_first=True removes one
# level so the dummy columns are not perfectly collinear with the intercept
# that LinearRegression fits by default.
sex_dummies = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True, dtype=float)

# Separate features and target: every column except the target and the two
# raw/integer-coded Sex columns, plus the one-hot Sex indicators.
X = pd.concat(
    [df.drop(['Rings', 'Sex', 'Sex_encoded'], axis=1), sex_dummies],
    axis=1,
)
y = df['Rings']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {X.columns.tolist()}")


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the
# partition reproducible across notebook runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up on each side of the split.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training set size: {n_train_rows}")
print(f"Test set size: {n_test_rows}")


## 3. Entraînement du modèle avec MLflow


In [None]:
# Fit the baseline model inside one MLflow run so that its parameters,
# metrics and serialized model are recorded together.
with mlflow.start_run(run_name="linear_regression_baseline"):
    # Fit on the training split only
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on both splits so train and test scores can be compared
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Error metrics: RMSE is the square root of the mean squared error
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)
    train_rmse = np.sqrt(train_mse)
    test_rmse = np.sqrt(test_mse)
    train_mae = mean_absolute_error(y_train, y_pred_train)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    # Run configuration, logged one parameter at a time in this order
    run_params = {
        "model_type": "LinearRegression",
        "test_size": 0.2,
        "random_state": 42,
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    # Evaluation results, logged one metric at a time in this order
    run_metrics = {
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_mae": train_mae,
        "test_mae": test_mae,
        "train_r2": train_r2,
        "test_r2": test_r2,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator with the run
    mlflow.sklearn.log_model(model, "model")

    # Human-readable summary, four decimals per metric
    print("Model Performance:")
    report_rows = (
        ("Train RMSE", train_rmse),
        ("Test RMSE", test_rmse),
        ("Train MAE", train_mae),
        ("Test MAE", test_mae),
        ("Train R²", train_r2),
        ("Test R²", test_r2),
    )
    for label, score in report_rows:
        print(f"{label}: {score:.4f}")
