In [5]:
import mlflow
from mlflow.models import infer_signature
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import optuna
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso, LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold, GridSearchCV
import shap
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

from pathlib import Path

In [6]:
data= pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

experiment_name = "house_prices"
mlflow.set_tracking_uri("http://localhost:8080")
mlflow.set_experiment(experiment_name)


X = data.drop(columns= ["SalePrice", "Id"])
y = np.log(data["SalePrice"])
X_test = test.drop(columns = ["Id"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size= 0.2, random_state= 42)

In [7]:
num_cols = X.select_dtypes(include = ["number"]).columns
cat_cols = X.select_dtypes(include = ["object"]).columns

special_num_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
other_num_cols = [col for col in num_cols if col not in special_num_cols]

masvnr_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0))
])

lotfront_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

def garageyr_imputer(X):
    X_filled = X.copy()
    median_year = X['YearBuilt'].median()
    X_filled['GarageYrBlt'] = X_filled['GarageYrBlt'].fillna(median_year)
    return X_filled

garage_tf = Pipeline([
    ("imputer", FunctionTransformer(garageyr_imputer))
])

# Pipeline num√©rique autres colonnes
other_num_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

# Pipeline cat√©goriel
cat_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combiner tout
preprocess = ColumnTransformer([
    ("lotfront", lotfront_tf, ['LotFrontage']),
    ("masvnr", masvnr_tf, ['MasVnrArea']),
    ("garage", garage_tf, ['GarageYrBlt', 'YearBuilt']),  # On passe YearBuilt aussi pour l'imputer
    ("other_num", other_num_tf, other_num_cols),
    ("cat", cat_tf, cat_cols)
])


In [8]:
cv = KFold(n_splits=3, shuffle=True, random_state=42)


def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 7),
        "iterations": trial.suggest_int("iterations", 500, 2000),
        "task_type": "CPU",
        "verbose": 0
    }

    model = CatBoostRegressor(**params)
    pipeline = Pipeline([
        ("preprocess", preprocess),
        ("model", model)
    ])
    
    scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="r2", n_jobs=-1)
    return np.mean(scores)

study = optuna.create_study(direction="maximize")

#
with mlflow.start_run(run_name="optuna_catboost"):

    study.optimize(objective, n_trials=25) 
    
    best_params = study.best_params
    best_r2 = study.best_value

    mlflow.log_metrics({"best_r2": best_r2})
    
    best_model = CatBoostRegressor(**best_params)
    pipeline = Pipeline([
        ("preprocess", preprocess),
        ("model", best_model)
    ])
    pipeline.fit(X_train, y_train)

    input_example = X_train.sample(5)
    signature = infer_signature(input_example, pipeline.predict(input_example))

    mlflow.sklearn.log_model(
        pipeline,
        name="optuna_catboost",
        signature=signature,
        input_example=input_example
    )
    
    # √âvaluation sur le set de validation
    eval_data = pd.DataFrame(X_val, columns=X.columns)
    eval_data["SalePrice"] = y_val

    model_uri = f"runs:/{mlflow.active_run().info.run_id}/optuna_catboost"
    result = mlflow.models.evaluate(
        model_uri,
        eval_data,
        targets="SalePrice",
        model_type="regressor"
    )

print("Best params:", best_params)
print("Best CV r2:", best_r2)
print("Evaluation metrics:", result.metrics)


[I 2025-10-05 19:37:32,383] A new study created in memory with name: no-name-9f1ea218-e465-4053-b2b7-9e2499fa1577
[I 2025-10-05 19:38:30,465] Trial 0 finished with value: 0.8934355528411784 and parameters: {'learning_rate': 0.04202712032585029, 'depth': 6, 'l2_leaf_reg': 4.109551501156431, 'iterations': 1683}. Best is trial 0 with value: 0.8934355528411784.
[I 2025-10-05 19:41:01,120] Trial 1 finished with value: 0.8869361150864199 and parameters: {'learning_rate': 0.03636206527960148, 'depth': 9, 'l2_leaf_reg': 6.74760054030921, 'iterations': 1317}. Best is trial 0 with value: 0.8934355528411784.
[I 2025-10-05 19:42:50,172] Trial 2 finished with value: 0.8859071873484815 and parameters: {'learning_rate': 0.04959357696385259, 'depth': 9, 'l2_leaf_reg': 2.1989845708014837, 'iterations': 1315}. Best is trial 0 with value: 0.8934355528411784.
[I 2025-10-05 19:43:13,505] Trial 3 finished with value: 0.8909308050113989 and parameters: {'learning_rate': 0.014367506954024015, 'depth': 7, 'l2_

0:	learn: 0.3820262	total: 49ms	remaining: 49.1s
1:	learn: 0.3733121	total: 50.9ms	remaining: 25.5s
2:	learn: 0.3649543	total: 54.7ms	remaining: 18.3s
3:	learn: 0.3570591	total: 57.4ms	remaining: 14.3s
4:	learn: 0.3501734	total: 59.8ms	remaining: 12s
5:	learn: 0.3424945	total: 62.2ms	remaining: 10.3s
6:	learn: 0.3351755	total: 66.4ms	remaining: 9.45s
7:	learn: 0.3283342	total: 68.8ms	remaining: 8.57s
8:	learn: 0.3218308	total: 70.9ms	remaining: 7.84s
9:	learn: 0.3158031	total: 73ms	remaining: 7.25s
10:	learn: 0.3097018	total: 75.3ms	remaining: 6.8s
11:	learn: 0.3037161	total: 78ms	remaining: 6.45s
12:	learn: 0.2977778	total: 80.2ms	remaining: 6.11s
13:	learn: 0.2923274	total: 83ms	remaining: 5.87s
14:	learn: 0.2866505	total: 85ms	remaining: 5.61s
15:	learn: 0.2813050	total: 87ms	remaining: 5.38s
16:	learn: 0.2760819	total: 88.9ms	remaining: 5.16s
17:	learn: 0.2707401	total: 91.1ms	remaining: 4.99s
18:	learn: 0.2660556	total: 93.3ms	remaining: 4.84s
19:	learn: 0.2610935	total: 95.3ms	re

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:00<00:00, 2438.75it/s]
2025/10/05 19:54:34 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


üèÉ View run optuna_catboost at: http://localhost:8080/#/experiments/1/runs/1fa4f779db424173bbdc1b0ad059d128
üß™ View experiment at: http://localhost:8080/#/experiments/1
Best params: {'learning_rate': 0.03567017275060585, 'depth': 5, 'l2_leaf_reg': 1.4678913191322192, 'iterations': 1004}
Best CV r2: 0.8936169152096199
Evaluation metrics: {'score': np.float64(0.9105378571023105), 'example_count': 292, 'mean_absolute_error': 0.08452115833213014, 'mean_squared_error': 0.016694845236133755, 'root_mean_squared_error': 0.1292085339137232, 'sum_on_target': np.float64(3503.3129158965576), 'mean_on_target': np.float64(11.997646972248484), 'r2_score': 0.9105378571023105, 'max_error': 0.8215112157399584, 'mean_absolute_percentage_error': 0.007118606390815896}


In [None]:
print("Best params:", best_params)
print("Best CV r2:", best_r2)
print("Evaluation metrics:", result.metrics)

Best params: {'learning_rate': 0.03567017275060585, 'depth': 5, 'l2_leaf_reg': 1.4678913191322192, 'iterations': 1004}
Best CV r2: 0.8936169152096199
Evaluation metrics: {'score': np.float64(0.9105378571023105), 'example_count': 292, 'mean_absolute_error': 0.08452115833213014, 'mean_squared_error': 0.016694845236133755, 'root_mean_squared_error': 0.1292085339137232, 'sum_on_target': np.float64(3503.3129158965576), 'mean_on_target': np.float64(11.997646972248484), 'r2_score': 0.9105378571023105, 'max_error': 0.8215112157399584, 'mean_absolute_percentage_error': 0.007118606390815896}


In [15]:
params =  {'learning_rate': 0.03567017275060585, 'depth': 5, 'l2_leaf_reg': 1.4678913191322192, 'iterations': 1004}

final_pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", CatBoostRegressor(**params))
])

final_pipeline.fit(X_train, y_train)

0:	learn: 0.3820262	total: 1.63ms	remaining: 1.64s
1:	learn: 0.3733121	total: 4.17ms	remaining: 2.09s
2:	learn: 0.3649543	total: 5.58ms	remaining: 1.86s
3:	learn: 0.3570591	total: 6.97ms	remaining: 1.74s
4:	learn: 0.3501734	total: 8.23ms	remaining: 1.64s
5:	learn: 0.3424945	total: 9.55ms	remaining: 1.59s
6:	learn: 0.3351755	total: 10.8ms	remaining: 1.54s
7:	learn: 0.3283342	total: 12.4ms	remaining: 1.54s
8:	learn: 0.3218308	total: 13.7ms	remaining: 1.51s
9:	learn: 0.3158031	total: 15.1ms	remaining: 1.5s
10:	learn: 0.3097018	total: 16.7ms	remaining: 1.5s
11:	learn: 0.3037161	total: 18.1ms	remaining: 1.5s
12:	learn: 0.2977778	total: 20ms	remaining: 1.52s
13:	learn: 0.2923274	total: 21.4ms	remaining: 1.51s
14:	learn: 0.2866505	total: 24.4ms	remaining: 1.61s
15:	learn: 0.2813050	total: 26.6ms	remaining: 1.64s
16:	learn: 0.2760819	total: 28.1ms	remaining: 1.63s
17:	learn: 0.2707401	total: 29.8ms	remaining: 1.63s
18:	learn: 0.2660556	total: 31.4ms	remaining: 1.63s
19:	learn: 0.2610935	total:

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('lotfront', ...), ('masvnr', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function gar...x77f2f52ed3a0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [17]:

def get_feature_names(column_transformer):
    feature_names = []

    for name, transformer, columns in column_transformer.transformers_:
        if transformer == 'drop' or transformer == 'passthrough':
            continue

        # Si le transformateur a get_feature_names_out()
        if hasattr(transformer, 'get_feature_names_out'):
            try:
                names = transformer.get_feature_names_out(columns)
            except:
                names = columns
        else:
            names = columns

        feature_names.extend(names)

    return feature_names

# Usage
feature_names = get_feature_names(final_pipeline.named_steps["preprocess"])
catboost_model = final_pipeline.named_steps["model"]
importances = catboost_model.get_feature_importance()

import pandas as pd
feature_importances = pd.DataFrame({
    "feature": feature_names,
    "importance": importances
}).sort_values(by="importance", ascending=False)

feature_importances.head(15)




Unnamed: 0,feature,importance
6,OverallQual,18.337034
17,GrLivArea,15.043605
13,TotalBsmtSF,4.302416
14,1stFlrSF,4.262352
26,GarageCars,4.152465
5,LotArea,3.820401
10,BsmtFinSF1,3.649394
25,Fireplaces,3.53423
8,YearBuilt,3.235723
27,GarageArea,2.649152
