# Ensembles et Modèles Avancés



In [96]:
# Imports principaux
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.base import clone

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.svm import SVR

from scipy.stats import spearmanr

from xgboost import XGBRegressor




In [None]:

# Load the feature table and the target table, then glue them side by side.
# pd.concat(axis="columns") aligns rows on the index, not on an ID column.
X, y = (pd.read_csv(path) for path in ("X_train_NHkHMNU.csv", "y_train_ZAN5mwg.csv"))
df = pd.concat([X, y], axis="columns")

# Drop by *label* the second-to-last column of the concatenated frame.
# NOTE(review): presumably this is the identifier column repeated in the
# target file; dropping by label removes every column sharing that label --
# confirm against the CSV headers.
second_to_last_label = df.columns[-2]
df = df.drop(columns=second_to_last_label)

print(df.shape)
df.head()


(1494, 35)


Unnamed: 0,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,FR_NET_IMPORT,...,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET
0,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,-0.69286,...,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445,0.028313
1,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,1.130838,...,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365,-0.112516
2,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,1.682587,...,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952,-0.18084
3,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,-0.56323,...,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948,-0.260356
4,818,FR,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,-0.990324,...,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378,-0.071733


In [98]:
# Fonctions de feature engineering reprises du notebook d'origine

def drop_columns(df, columns):
    """Remove the given column labels from ``df`` in place.

    Labels that are not present in ``df`` are skipped silently
    (``errors="ignore"``). Nothing is returned: the caller's DataFrame
    itself is modified.
    """
    df.drop(columns=list(columns), inplace=True, errors="ignore")


def compute_median(df):
    """Return the median of every numeric column of ``df``.

    Non-numeric columns are left out. The result is a Series indexed by
    the numeric column names.
    """
    numeric_part = df.select_dtypes(include=["number"])
    return numeric_part.median()


def missing_values_changed_with_median(df, medians):
    """Fill missing values of the numeric columns of ``df`` with ``medians``.

    ``medians`` is indexed by column name and must contain an entry for
    every numeric column of ``df``. Non-numeric columns are not touched.
    ``df`` is modified in place and also returned.
    """
    numeric_names = df.select_dtypes(include=["number"]).columns
    fill_values = medians[numeric_names]
    df[numeric_names] = df[numeric_names].fillna(fill_values)
    return df


def add_threshold_columns(df: pd.DataFrame, column_name: str, threshold: float, way: str):
    """Add a thresholded copy of ``column_name`` to ``df`` in place.

    The new column is named ``<column_name>_THRESHOLD_<threshold>``. It keeps
    the original value where the comparison holds and is set to 0 elsewhere.

    way: ``"sup"`` keeps values ``>= threshold``; any other value keeps
         values ``<= threshold``.
    """
    source = df[column_name]
    keep_mask = source >= threshold if way == "sup" else source <= threshold
    new_name = "_THRESHOLD_".join((column_name, str(threshold)))
    df[new_name] = source.where(keep_mask, 0)


def compute_quantiles(df, low=0.25, high=0.75, coeff=5):
    """Compute per-column acceptance bounds from two quantiles.

    For each numeric column the bounds are
    ``(q_low - coeff * spread, q_high + coeff * spread)`` where ``spread`` is
    the distance between the ``high`` and ``low`` quantiles.

    Returns a dict mapping column name to a ``(lower, upper)`` tuple.
    """
    bounds = {}
    numeric_names = df.select_dtypes(include=["number"]).columns
    for name in numeric_names:
        values = df[name]
        lower_q, upper_q = values.quantile(low), values.quantile(high)
        margin = coeff * (upper_q - lower_q)
        bounds[name] = (lower_q - margin, upper_q + margin)
    return bounds


def outliers_filter(df, bounds):
    """Build a boolean row mask that is True for rows inside all ``bounds``.

    ``bounds`` maps column name to ``(lower, upper)``; both ends are
    inclusive. Entries whose column is absent from ``df`` are ignored.
    Missing values compare as False, so such rows are masked out.
    """
    keep = pd.Series(True, index=df.index)
    for name in bounds:
        if name not in df.columns:
            continue
        lower, upper = bounds[name]
        keep = keep & df[name].between(lower, upper)
    return keep


def feature_engineering(df, medians, threshold, columns_kept):
    """Apply the shared feature pipeline to one feature table.

    Steps, in order:
      1. drop a fixed set of columns (this mutates the ``df`` passed in);
      2. fill numeric missing values with ``medians``;
      3. add one thresholded column per entry of ``threshold``
         (``{column: [threshold_value, way]}``);
      4. keep only the columns listed in ``columns_kept`` plus every
         generated ``*_THRESHOLD_*`` column.

    Returns the reduced DataFrame.
    """
    drop_columns(df, ["DE_NET_IMPORT", "FR_NET_IMPORT", "DE_FR_EXCHANGE", "FR_COAL"])

    df = missing_values_changed_with_median(df, medians)

    for source_column, spec in threshold.items():
        add_threshold_columns(df, source_column, spec[0], spec[1])

    selected = [
        name
        for name in df.columns
        if (name in columns_kept) or ("_THRESHOLD_" in name)
    ]
    return df[selected]


def transform_one_country(df, threshold, columns_kept, standardisation=True):
    """Split one country's data and prepare train / test feature tables.

    The data is split 80/20 with a fixed seed. Medians, outlier bounds and the
    scaler are all estimated on the training part only, then applied to the
    test part. Outlier rows are removed from the training part only.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=["TARGET"])
    target = df["TARGET"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Medians come from the raw training features, before any column is dropped.
    medians = compute_median(X_train)
    X_train, X_test = (
        feature_engineering(part.copy(), medians, threshold, columns_kept)
        for part in (X_train, X_test)
    )

    # Filter training rows falling outside the quantile-based bounds.
    inliers = outliers_filter(X_train, compute_quantiles(X_train))
    X_train, y_train = X_train[inliers], y_train[inliers]

    if not standardisation:
        return X_train, X_test, y_train, y_test

    scaler = StandardScaler()
    # Re-wrap the scaled arrays so index and column names are preserved.
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X_test.columns
    )
    return X_train, X_test, y_train, y_test


def transform(df, threshold_fr, threshold_de, columns_kept_fr, columns_kept_de, standardisation=True):
    """Prepare train / test sets separately for France and Germany.

    Rows are selected on the ``COUNTRY`` column (``"FR"`` then ``"DE"``) and
    each subset goes through ``transform_one_country`` with its own
    thresholds and kept columns.

    Returns an 8-tuple:
    ``(X_train_fr, X_test_fr, y_train_fr, y_test_fr,
       X_train_de, X_test_de, y_train_de, y_test_de)``.
    """
    country_settings = (
        ("FR", threshold_fr, columns_kept_fr),
        ("DE", threshold_de, columns_kept_de),
    )

    prepared = []
    for code, country_threshold, country_columns in country_settings:
        subset = df[df["COUNTRY"] == code].copy()
        prepared.extend(
            transform_one_country(
                subset, country_threshold, country_columns, standardisation=standardisation
            )
        )
    return tuple(prepared)


In [99]:
# Parameters already validated earlier in the project.
# Threshold format: {column: [threshold_value, way]} where way "sup" keeps
# values >= threshold and any other value (here "inf") keeps values <= threshold.
threshold_fr = {
    "COAL_RET": [0.8, "inf"],
    "FR_CONSUMPTION": [1.5, "sup"],
    "FR_NUCLEAR": [-1.8, "inf"],
    "FR_HYDRO": [-0.4, "inf"],
}
threshold_de = {
    "DE_CONSUMPTION": [1.2, "sup"],
    "DE_NET_EXPORT": [-0.45, "sup"],
    "DE_WINDPOW": [0.3, "sup"],
}

# Raw columns kept for each country (generated threshold columns are kept too).
columns_kept_fr = ["DE_NET_EXPORT", "DE_HYDRO", "DE_WINDPOW", "FR_WINDPOW", "GAS_RET", "CARBON_RET"]
columns_kept_de = [
    "DE_NET_EXPORT", "DE_GAS", "DE_COAL", "DE_HYDRO", "DE_WINDPOW",
    "FR_WINDPOW", "DE_LIGNITE", "DE_RESIDUAL_LOAD", "DE_WIND",
]

(
    X_train_fr, X_test_fr, y_train_fr, y_test_fr,
    X_train_de, X_test_de, y_train_de, y_test_de,
) = transform(
    df,
    threshold_fr=threshold_fr,
    threshold_de=threshold_de,
    columns_kept_fr=columns_kept_fr,
    columns_kept_de=columns_kept_de,
)

# Row counts after the split (and after outlier removal on the training part).
print(
    f"France: {len(X_train_fr)} train / {len(X_test_fr)} test -- "
    f"Germany: {len(X_train_de)} train / {len(X_test_de)} test"
)


France: 527 train / 171 test -- Germany: 443 train / 129 test


In [100]:
# Fonctions utilitaires pour l'évaluation

def spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between ``y_true`` and ``y_pred``."""
    rho, _p_value = spearmanr(y_true, y_pred)
    return rho


def kfold_score(model, X, y, k=5):
    """Cross-validate ``model`` with shuffled K-fold and Spearman correlation.

    A fresh clone of ``model`` is fitted on each fold, so the estimator
    passed in is never fitted here. The fold split uses a fixed seed (42).

    Returns ``(mean, std)`` of the per-fold Spearman scores as floats.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_scores = []
    for fit_rows, val_rows in splitter.split(X):
        estimator = clone(model)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predictions = estimator.predict(X.iloc[val_rows])
        fold_scores.append(spearman_corr(y.iloc[val_rows], predictions))
    return float(np.mean(fold_scores)), float(np.std(fold_scores))

# Helper returning all the scores of a model in one call.
def evaluate_simple(model, X_train, y_train, X_test, y_test, k=5):
    """Fit ``model`` on the training set and collect its evaluation metrics.

    ``model`` itself is fitted (in place) on the full training set; the
    K-fold figures come from clones fitted inside ``kfold_score``.

    Returns a dict with the keys ``spearman_train``, ``spearman_test``,
    ``spearman_kfold``, ``spearman_std``, ``r2_test`` and ``rmse_test``.
    """
    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
    cv_mean, cv_std = kfold_score(model, X_train, y_train, k=k)

    metrics = {}
    metrics["spearman_train"] = float(spearman_corr(y_train, train_predictions))
    metrics["spearman_test"] = float(spearman_corr(y_test, test_predictions))
    metrics["spearman_kfold"] = float(cv_mean)
    metrics["spearman_std"] = float(cv_std)
    metrics["r2_test"] = float(r2_score(y_test, test_predictions))
    metrics["rmse_test"] = float(rmse)
    return metrics


In [101]:
# 1) Decision Tree Regressor: hyperparameter grid search + K-fold check
# Model selection is driven by Spearman correlation (higher is better).
spearman_scorer = make_scorer(spearman_corr, greater_is_better=True)

fr_param_grid = {
    "max_depth": [3, 4, 5, 6, 7],
    "min_samples_leaf": [5, 10, 20, 30, 50],
    "min_samples_split": [5, 10, 20, 30],
}
de_param_grid = {
    "max_depth": [3, 4, 5, 7, 10, 12],
    "min_samples_leaf": [5, 10, 20, 30, 50],
    "min_samples_split": [5, 10, 20],
}

# Settings shared by both country searches.
_tree_search_options = {"scoring": spearman_scorer, "cv": 5, "n_jobs": -1}
fr_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42), param_grid=fr_param_grid, **_tree_search_options
)
de_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42), param_grid=de_param_grid, **_tree_search_options
)

fr_search.fit(X_train_fr, y_train_fr)
fr_tree = fr_search.best_estimator_
print("France - meilleurs hyperparamètres :", fr_search.best_params_)

de_search.fit(X_train_de, y_train_de)
de_tree = de_search.best_estimator_
print("Germany - meilleurs hyperparamètres :", de_search.best_params_)

# Re-score the selected trees with the shuffled K-fold helper: (mean, std).
fr_kfold = kfold_score(fr_tree, X_train_fr, y_train_fr, k=5)
de_kfold = kfold_score(de_tree, X_train_de, y_train_de, k=5)

print(f"France - KFold Spearman: mean={fr_kfold[0]:.3f}, std={fr_kfold[1]:.3f}")
print(f"Germany - KFold Spearman: mean={de_kfold[0]:.3f}, std={de_kfold[1]:.3f}")


France - meilleurs hyperparamètres : {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 20}
Germany - meilleurs hyperparamètres : {'max_depth': 3, 'min_samples_leaf': 20, 'min_samples_split': 5}
France - KFold Spearman: mean=0.150, std=0.054
Germany - KFold Spearman: mean=0.202, std=0.139


In [None]:
# 2) Bagging on top of the decision tree
# Unfitted base trees carrying the hyperparameters selected by the tree searches.
base_fr_tree = DecisionTreeRegressor(random_state=42, **fr_search.best_params_)
base_de_tree = DecisionTreeRegressor(random_state=42, **de_search.best_params_)

# NOTE(review): with bootstrap=False, max_samples=1.0 and max_features=1.0
# every base tree presumably sees the same rows and columns, so that grid
# point may behave like a single tree -- confirm.
bagging_fr_param_grid = {
    "n_estimators": [30, 50, 80],
    "max_samples": [0.6, 0.8, 1.0],
    "max_features": [0.8, 1.0],
    "bootstrap": [True, False],
}
bagging_de_param_grid = {
    "n_estimators": [30, 60, 100],
    "max_samples": [0.6, 0.9, 1.0],
    "max_features": [0.7, 1.0],
    "bootstrap": [True, False],
}

# Settings shared by both country searches.
_bagging_search_options = {"scoring": spearman_scorer, "cv": 5, "n_jobs": -1}
bagging_fr_search = GridSearchCV(
    BaggingRegressor(estimator=base_fr_tree, random_state=42),
    param_grid=bagging_fr_param_grid,
    **_bagging_search_options,
)
bagging_de_search = GridSearchCV(
    BaggingRegressor(estimator=base_de_tree, random_state=42),
    param_grid=bagging_de_param_grid,
    **_bagging_search_options,
)

bagging_fr_search.fit(X_train_fr, y_train_fr)
bagging_fr = bagging_fr_search.best_estimator_
print("France - meilleurs hyperparamètres bagging :", bagging_fr_search.best_params_)

bagging_de_search.fit(X_train_de, y_train_de)
bagging_de = bagging_de_search.best_estimator_
print("Germany - meilleurs hyperparamètres bagging :", bagging_de_search.best_params_)

# Train / hold-out / K-fold metrics of the selected ensembles.
bagging_fr_metrics = evaluate_simple(bagging_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
bagging_de_metrics = evaluate_simple(bagging_de, X_train_de, y_train_de, X_test_de, y_test_de)

print("France - Bagging optimisé")
print(f"  Spearman test  : {bagging_fr_metrics['spearman_test']:.3f}")
print(f"  Spearman train : {bagging_fr_metrics['spearman_train']:.3f}")
print("Germany - Bagging optimisé")
print(f"  Spearman test  : {bagging_de_metrics['spearman_test']:.3f}")
print(f"  Spearman train : {bagging_de_metrics['spearman_train']:.3f}")


France - meilleurs hyperparamètres bagging : {'bootstrap': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 30}
Germany - meilleurs hyperparamètres bagging : {'bootstrap': False, 'max_features': 0.7, 'max_samples': 0.9, 'n_estimators': 100}
France - Bagging optimisé
  Spearman test  : 0.071
  Spearman train : 0.285
Germany - Bagging optimisé
  Spearman test  : 0.211
  Spearman train : 0.487


Résultats pas bons non plus : le bagging n'apporte pas de changement important.

In [103]:
# 3) SVM (SVR) with grid search
# The same grid is used for both countries.
svr_param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

# Settings shared by both country searches.
_svr_search_options = {"param_grid": svr_param_grid, "scoring": spearman_scorer, "cv": 5, "n_jobs": -1}

svr_fr_search = GridSearchCV(SVR(epsilon=0.1), **_svr_search_options)
svr_fr_search.fit(X_train_fr, y_train_fr)
svr_fr = svr_fr_search.best_estimator_
print("France - meilleurs hyperparamètres SVR :", svr_fr_search.best_params_)

svr_de_search = GridSearchCV(SVR(epsilon=0.1), **_svr_search_options)
svr_de_search.fit(X_train_de, y_train_de)
svr_de = svr_de_search.best_estimator_
print("Germany - meilleurs hyperparamètres SVR :", svr_de_search.best_params_)

# Train / hold-out / K-fold metrics of the selected models.
svr_fr_metrics = evaluate_simple(svr_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
svr_de_metrics = evaluate_simple(svr_de, X_train_de, y_train_de, X_test_de, y_test_de)

print("France - SVR optimisé", svr_fr_metrics)
print("Germany - SVR optimisé", svr_de_metrics)


France - meilleurs hyperparamètres SVR : {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Germany - meilleurs hyperparamètres SVR : {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
France - SVR optimisé {'spearman_train': 0.258054443488878, 'spearman_test': 0.22184475834882816, 'spearman_kfold': 0.21711805518813038, 'spearman_std': 0.10847272088092935, 'r2_test': 0.00932940725532061, 'rmse_test': 1.1818609558511215}
Germany - SVR optimisé {'spearman_train': 0.3152258369917798, 'spearman_test': 0.37359682468694105, 'spearman_kfold': 0.27390404160235177, 'spearman_std': 0.1278410551302806, 'r2_test': 0.026519451394977023, 'rmse_test': 0.963154599695358}


Mieux que pour le Decision Tree, mais le R² reste très mauvais, donc peu intéressant.

In [104]:
# 4) Random Forest (hand-picked settings, no search)
# Options common to both forests; size and depth differ per country.
_rf_common_options = {"random_state": 42, "n_jobs": -1}

rf_fr = RandomForestRegressor(
    n_estimators=300, max_depth=8, min_samples_leaf=20, **_rf_common_options
)
rf_de = RandomForestRegressor(
    n_estimators=400, max_depth=10, min_samples_leaf=10, **_rf_common_options
)

# Train / hold-out / K-fold metrics for each country.
rf_fr_metrics = evaluate_simple(rf_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
rf_de_metrics = evaluate_simple(rf_de, X_train_de, y_train_de, X_test_de, y_test_de)

print("France - Random Forest", rf_fr_metrics)
print("Germany - Random Forest", rf_de_metrics)


France - Random Forest {'spearman_train': 0.47569906268285644, 'spearman_test': 0.19730253996123978, 'spearman_kfold': 0.14865402867054464, 'spearman_std': 0.07712237886188927, 'r2_test': 0.012698839215019797, 'rmse_test': 1.1798493932268128}
Germany - Random Forest {'spearman_train': 0.6802854052275341, 'spearman_test': 0.2264031753130591, 'spearman_kfold': 0.24473379313375196, 'spearman_std': 0.1083541617965652, 'r2_test': 0.03846469950693754, 'rmse_test': 0.9572270889975645}


In [105]:
# 5) XGBoost (hand-picked settings, identical for both countries)
_xgb_options = {
    "n_estimators": 200,
    "max_depth": 3,
    "learning_rate": 0.03,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "reg_lambda": 2.0,
    "reg_alpha": 1.0,
    "objective": "reg:squarederror",
    "random_state": 42,
}

# Two separate estimator instances, one per country.
xgb_fr = XGBRegressor(**_xgb_options)
xgb_de = XGBRegressor(**_xgb_options)

# Train / hold-out / K-fold metrics for each country.
xgb_fr_metrics = evaluate_simple(xgb_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
xgb_de_metrics = evaluate_simple(xgb_de, X_train_de, y_train_de, X_test_de, y_test_de)

print("France - XGBoost", xgb_fr_metrics)
print("Germany - XGBoost", xgb_de_metrics)


France - XGBoost {'spearman_train': 0.5488196556407897, 'spearman_test': 0.21093923904966902, 'spearman_kfold': 0.12671876636773632, 'spearman_std': 0.03497917600333644, 'r2_test': 0.01866402643504894, 'rmse_test': 1.17627971974826}
Germany - XGBoost {'spearman_train': 0.6806106030144915, 'spearman_test': 0.2218358676207514, 'spearman_kfold': 0.23729051335282772, 'spearman_std': 0.11896660178511997, 'r2_test': -0.0028596275131647086, 'rmse_test': 0.9775802928246266}


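Overfitting assez important : les scores Spearman d'entraînement (0,55 FR / 0,68 DE) sont très supérieurs aux scores de test (0,21 FR / 0,22 DE).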


In [None]:
# Summary table: one row per (model, country) with the train and hold-out
# Spearman scores and the standard deviation of the K-fold Spearman scores.

# The tuned decision trees are only scored with K-fold in the cell that builds
# them, so their train / hold-out metrics are computed here; without these two
# assignments the names below are undefined and the cell raises NameError.
# Clones are evaluated so the already-fitted best estimators stay untouched.
fr_tree_holdout = evaluate_simple(clone(fr_tree), X_train_fr, y_train_fr, X_test_fr, y_test_fr)
de_tree_holdout = evaluate_simple(clone(de_tree), X_train_de, y_train_de, X_test_de, y_test_de)

# (model label, country, metrics dict, K-fold Spearman std, note)
summary_sources = [
    ("DecisionTree", "FR", fr_tree_holdout, fr_kfold[1], "kfold + test"),
    ("DecisionTree", "DE", de_tree_holdout, de_kfold[1], "kfold + test"),
    ("DecisionTree_Bagging", "FR", bagging_fr_metrics, bagging_fr_metrics["spearman_std"], "test simple"),
    ("DecisionTree_Bagging", "DE", bagging_de_metrics, bagging_de_metrics["spearman_std"], "test simple"),
    ("SVR", "FR", svr_fr_metrics, svr_fr_metrics["spearman_std"], "test simple"),
    ("SVR", "DE", svr_de_metrics, svr_de_metrics["spearman_std"], "test simple"),
    ("RandomForest", "FR", rf_fr_metrics, rf_fr_metrics["spearman_std"], "test simple"),
    ("RandomForest", "DE", rf_de_metrics, rf_de_metrics["spearman_std"], "test simple"),
    ("XGBoost", "FR", xgb_fr_metrics, xgb_fr_metrics["spearman_std"], "test simple"),
    ("XGBoost", "DE", xgb_de_metrics, xgb_de_metrics["spearman_std"], "test simple"),
]

summary_rows = [
    {
        "model": model_name,
        "country": country,
        "spearman_train": metrics["spearman_train"],
        "spearman_test": metrics["spearman_test"],
        "spearman_std": kfold_std,
        "note": note,
    }
    for model_name, country, metrics, kfold_std, note in summary_sources
]

summary_df = pd.DataFrame(summary_rows)
display(summary_df)



Unnamed: 0,model,country,spearman_train,spearman_test,spearman_std,note
0,DecisionTree,FR,0.279005,0.038782,0.053511,kfold + test
1,DecisionTree,DE,0.36309,0.123328,0.139108,kfold + test
2,DecisionTree_Bagging,FR,0.284972,0.070668,0.053511,test simple
3,DecisionTree_Bagging,DE,0.486671,0.210689,0.111035,test simple
4,SVR,FR,0.258054,0.221845,0.108473,test simple
5,SVR,DE,0.315226,0.373597,0.127841,test simple
6,RandomForest,FR,0.475699,0.197303,0.077122,test simple
7,RandomForest,DE,0.680285,0.226403,0.108354,test simple
8,XGBoost,FR,0.54882,0.210939,0.034979,test simple
9,XGBoost,DE,0.680611,0.221836,0.118967,test simple


## Conclusion rapide
- Le modèle Decision Tree = résultats modestes : il capte partiellement les relations entre les variables et fait de l'overfitting
- Le Bagging = améliore légèrement la stabilité du modèle en réduisant la variance, mais les gains en performance restent très limités.
- Le SVM = meilleure cohérence dans les prédictions et parvient à suivre plus fidèlement la tendance générale, sans modéliser précisément la variabilité des données.
- Le Random Forest = extrait des relations plus complexes et offre de meilleurs scores de corrélation tout en restant relativement robuste.
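- XGBoost = scores d'entraînement parmi les plus élevés, mais il fait quand même pas mal d'overfitting : sur le jeu de test il reste au niveau du Random Forest, et c'est le SVR qui obtient les meilleurs Spearman de test (0,22 FR / 0,37 DE).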