In [1]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import pandas as pd
from math import sqrt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgbm
from xgboost import XGBRegressor, plot_importance

In [2]:
df = pd.read_csv("../training_data/train_transformed.csv")
test_df = pd.read_csv("../training_data/test_transformed.csv")

In [3]:
X, y = df.drop('revenue', axis=1), df['revenue']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=118)

In [4]:
# 📚 Modèles à tester

# Définir le pipeline : scaler suivi d’un modèle Ridge
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Grille de paramètres : attention à préfixer avec 'ridge__'
params_ridge = {
    'ridge__alpha': [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20],
    'ridge__fit_intercept': [True, False],
    'ridge__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso())
])

params_lasso = {
    'lasso__alpha': [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20],
    'lasso__fit_intercept': [True, False]
}
params_knn = {
    'n_neighbors' : [3, 5, 7, 9, 11],
}

knn_model = KNeighborsRegressor()

params_rf = {
    'max_depth': [10, 30, 35, 50, 65, 75, 100],
    'max_features': [.3, .4, .5, .6],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [30, 50, 100, 200]
}

rf = RandomForestRegressor()

params_lgbm = {
    'learning_rate': [.01, .1, .5, .7, .9, .95, .99, 1],
    'boosting': ['gbdt'],
    'metric': ['l1'],
    'feature_fraction': [.3, .4, .5, 1],
    'num_leaves': [20],
    'min_data': [10],
    'max_depth': [10],
    'n_estimators': [10, 30, 50, 100]
}

lgb = lgbm.LGBMRegressor()

params_xgb = {
    'learning_rate': [.1, .5, .7, .9, .95, .99, 1],
    'colsample_bytree': [.3, .4, .5, .6],
    'max_depth': [4],
    'alpha': [3],
    'subsample': [.5],
    'n_estimators': [30, 70, 100, 200]
}

xgb_model = XGBRegressor()

models = {
    "Ridge": GridSearchCV(ridge_pipeline, params_ridge, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1),
    "Lasso": GridSearchCV(lasso_pipeline, params_lasso, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1),
    "ElasticNet": ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], eps=5e-2, cv=10, n_jobs=-1),
    "KNN": GridSearchCV(knn_model, params_knn, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1),
    "RandomForest": GridSearchCV(rf, params_rf, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1),
    "LightGBM": GridSearchCV(lgb, params_lgbm, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1),
    "XGBoost": GridSearchCV(xgb_model, params_xgb, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1),
}


In [5]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from math import sqrt

all_preds = []
mlflow.set_experiment("Restaurant Revenue Forecasting")

best_rmse = float("inf")
best_model = None
best_model_name = ""
best_run_id = ""

for name, model in models.items():
    with mlflow.start_run(run_name=name) as run:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Scores
        rmse = sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Signature pour l'inférence
        signature = infer_signature(X_train, model.predict(X_train))

        # Logging MLflow
        mlflow.log_param("model_type", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2_score", r2)
        mlflow.log_metric("mape", mape)
        mlflow.sklearn.log_model(model, "model", signature=signature)

        print(f"✅ {name} terminé | RMSE = {rmse:.2f} | R² = {r2:.3f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
            best_model_name = name
            best_run_id = run.info.run_id

        all_preds.append(y_pred)

mlflow.set_tag("best_model", True)

print(f"\n🏆 Meilleur modèle : {best_model_name} | RMSE = {best_rmse:.2f}")
print(f"Run ID : {best_run_id}")


2025/08/01 00:59:15 INFO mlflow.tracking.fluent: Experiment with name 'Restaurant Revenue Forecasting' does not exist. Creating a new experiment.


✅ Ridge terminé | RMSE = 0.56 | R² = -0.037




✅ Lasso terminé | RMSE = 0.54 | R² = 0.029




✅ ElasticNet terminé | RMSE = 0.54 | R² = 0.056




✅ KNN terminé | RMSE = 0.51 | R² = 0.143




✅ RandomForest terminé | RMSE = 0.48 | R² = 0.253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 390
[LightGBM] [Info] Number of data points in the train set: 109, number of used features: 42
[LightGBM] [Info] Start training from score 15.200499




✅ LightGBM terminé | RMSE = 0.52 | R² = 0.102




✅ XGBoost terminé | RMSE = 0.51 | R² = 0.154

🏆 Meilleur modèle : RandomForest | RMSE = 0.48
Run ID : b30473580a504cbfbd9f7e717bcded0e


In [6]:
mlflow.sklearn.save_model(best_model, "best_model")