In [1]:
import mlflow
from mlflow.models import infer_signature
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso, LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
import shap
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
experiment_name = "house_prices"

mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

mlflow.set_experiment(experiment_name)

train = pd.read_csv("data/train.csv")

y = np.log(train["SalePrice"])
X = train.drop(columns = ["Id", "SalePrice"])

X_train, X_val ,y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
num_cols = X.select_dtypes(include = ["number"]).columns
cat_cols = X.select_dtypes(include = ["object"]).columns


In [4]:
special_num_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
other_num_cols = [col for col in num_cols if col not in special_num_cols]

masvnr_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0))
])

lotfront_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

def garageyr_imputer(X):
    X_filled = X.copy()
    median_year = X['YearBuilt'].median()
    X_filled['GarageYrBlt'] = X_filled['GarageYrBlt'].fillna(median_year)
    return X_filled

garage_tf = Pipeline([
    ("imputer", FunctionTransformer(garageyr_imputer))
])

# Pipeline num√©rique autres colonnes
other_num_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

# Pipeline cat√©goriel
cat_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combiner tout
preprocess = ColumnTransformer([
    ("lotfront", lotfront_tf, ['LotFrontage']),
    ("masvnr", masvnr_tf, ['MasVnrArea']),
    ("garage", garage_tf, ['GarageYrBlt', 'YearBuilt']),  # On passe YearBuilt aussi pour l'imputer
    ("other_num", other_num_tf, other_num_cols),
    ("cat", cat_tf, cat_cols)
])


In [None]:

models = {
    "random_forest": RandomForestRegressor(),
    "xgboost": XGBRegressor(),
    "lightgbm": LGBMRegressor(),
    "catboost": CatBoostRegressor(verbose=0),
    "lasso": Lasso(),
    "linear_regression": LinearRegression(),
    "ridge": RidgeCV(),
    "mlp": MLPRegressor(max_iter=1000)
}

# -------------------------
# Split dataset
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

eval_data = pd.DataFrame(X_test, columns=X.columns)
eval_data["SalePrice"] = y_test

results = {}
models = {
    "random_forest": RandomForestRegressor(),
    "xgboost": XGBRegressor(),
    "lightgbm": LGBMRegressor(),
    "catboost": CatBoostRegressor(),
    "lasso" : Lasso(),
    "linear_regression": LinearRegression(),
    "ridge": RidgeCV(),
    "mlp": MLPRegressor()
}

results = {}
eval_data = X_val.copy()
eval_data["SalePrice"] = y_val

# -------------------------
# Boucle d'entra√Ænement et log MLflow
# -------------------------
for model_name, model in models.items():
    with mlflow.start_run(run_name=f"eval_{model_name}_fe"):
        # Pipeline complet
        pipeline = Pipeline([
            ("preprocess", preprocess),
            ("model", model)
        ])
        
        pipeline.fit(X_train, y_train)
        
        # Exemple d'entr√©e pour MLflow
        input_example = X_train.sample(3)
        
        # Inf√©rer la signature
        signature = infer_signature(input_example, pipeline.predict(input_example))
        
        # Log le mod√®le
        mlflow.sklearn.log_model(
            pipeline, 
            name="model",  # <-- utiliser name
            signature=signature,
            input_example=input_example
        )
        model_uri = f"runs:/{mlflow.active_run().info.run_id}/model"
        
        # √âvaluation
        result = mlflow.models.evaluate(
            model_uri,
            eval_data,
            targets="SalePrice",
            model_type="regressor"
        )
        
        # Stocker les m√©triques
        results[model_name] = {
            "r2_score": result.metrics["r2_score"],
            "rmse": result.metrics["root_mean_squared_error"],
            "mae": result.metrics["mean_absolute_error"]
        }

# -------------------------
# Comparaison
# -------------------------
comparison_df = pd.DataFrame(results).T
print("Model Comparison:")
print(comparison_df[["r2_score", "rmse", "mae"]].round(3))


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 562.83it/s] 
2025/10/04 22:52:47 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_random_forest at: http://127.0.0.1:8080/#/experiments/1/runs/9b4e6c0d955346cbb73d59972847a728
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 650.28it/s]  
2025/10/04 22:53:02 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_xgboost at: http://127.0.0.1:8080/#/experiments/1/runs/ced5b361d7e446e2be14c2151bb5fdf1
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3283
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 179
[LightGBM] [Info] Start training from score 12.030652


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 863.00it/s]  
2025/10/04 22:53:19 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_lightgbm at: http://127.0.0.1:8080/#/experiments/1/runs/8f0437dd0e2f407aa22311cd8973609c
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1
Learning rate set to 0.04196
0:	learn: 0.3803769	total: 56.8ms	remaining: 56.7s
1:	learn: 0.3702672	total: 62.4ms	remaining: 31.1s
2:	learn: 0.3612183	total: 66.6ms	remaining: 22.1s
3:	learn: 0.3516728	total: 73ms	remaining: 18.2s
4:	learn: 0.3431257	total: 77.5ms	remaining: 15.4s
5:	learn: 0.3344090	total: 81.9ms	remaining: 13.6s
6:	learn: 0.3264107	total: 87.6ms	remaining: 12.4s
7:	learn: 0.3195233	total: 90.9ms	remaining: 11.3s
8:	learn: 0.3115236	total: 94.1ms	remaining: 10.4s
9:	learn: 0.3047306	total: 101ms	remaining: 9.96s
10:	learn: 0.2972191	total: 106ms	remaining: 9.54s
11:	learn: 0.2903880	total: 112ms	remaining: 9.26s
12:	learn: 0.2840207	total: 120ms	remaining: 9.08s
13:	learn: 0.2777755	total: 128ms	remaining: 8.98s
14:	learn: 0.2720062	total: 134ms	remaining: 8.78s
15:	learn: 0.2665294	total: 138ms	rema

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 1182.64it/s] 
2025/10/04 22:53:39 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_catboost at: http://127.0.0.1:8080/#/experiments/1/runs/59a5304c90b7400fb9e8530b4d660d92
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 749.08it/s] 
2025/10/04 22:53:44 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_lasso at: http://127.0.0.1:8080/#/experiments/1/runs/f4977782da5a4929ba548296b8784c00
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 1092.88it/s]
2025/10/04 22:53:50 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_linear_regression at: http://127.0.0.1:8080/#/experiments/1/runs/2ec00090ea184c338254fc707b761ade
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 1818.53it/s] 
2025/10/04 22:53:56 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_ridge at: http://127.0.0.1:8080/#/experiments/1/runs/7823f5093ebb462a8992b251293ca88f
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 1028.95it/s] 
2025/10/04 22:54:09 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run eval_mlp at: http://127.0.0.1:8080/#/experiments/1/runs/35a874ebbc074a0b91810d9c0ee1176c
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/1
Model Comparison:
                   r2_score   rmse    mae
random_forest         0.885  0.146  0.098
xgboost               0.880  0.149  0.099
lightgbm              0.895  0.140  0.091
catboost              0.911  0.129  0.085
lasso                 0.802  0.192  0.127
linear_regression     0.897  0.139  0.098
ridge               -38.460  2.714  1.668
mlp                -146.029  5.238  3.515
