# Ensembles et Modèles Avancés



In [60]:
# Imports principaux
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.base import clone

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.svm import SVR

from scipy.stats import spearmanr

from xgboost import XGBRegressor




In [61]:
# Load the data prepared previously
X = pd.read_csv("X_train_NHkHMNU.csv")
y = pd.read_csv("y_train_ZAN5mwg.csv")

# Column-wise concat pairs rows by index label (here the default positional index
# created by read_csv), not by any identifier column.
# NOTE(review): assumes both CSV files list their rows in the same order — confirm.
df = pd.concat([X, y], axis=1)
# drop() is label-based: every column carrying the same label as the second-to-last
# column is removed, not only that one position.
# NOTE(review): presumably this removes the identifier column duplicated in both
# files — verify against the CSV headers.
df = df.drop(columns=df.columns[-2], axis=1)  # same trick as in the other notebook

print(df.shape)
df.head()


(1494, 35)


Unnamed: 0,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,DE_FR_EXCHANGE,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_NET_IMPORT,FR_NET_IMPORT,...,DE_RAIN,FR_RAIN,DE_WIND,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET
0,206,FR,0.210099,-0.427458,-0.606523,0.606523,,0.69286,,-0.69286,...,-0.17268,-0.556356,-0.790823,-0.28316,-1.06907,-0.063404,0.339041,0.124552,-0.002445,0.028313
1,501,FR,-0.022399,-1.003452,-0.022063,0.022063,-0.57352,-1.130838,0.57352,1.130838,...,-1.2403,-0.770457,1.522331,0.828412,0.437419,1.831241,-0.659091,0.047114,-0.490365,-0.112516
2,687,FR,1.395035,1.978665,1.021305,-1.021305,-0.622021,-1.682587,0.622021,1.682587,...,-0.4807,-0.313338,0.431134,0.487608,0.684884,0.114836,0.535974,0.743338,0.204952,-0.18084
3,720,DE,-0.983324,-0.849198,-0.839586,0.839586,-0.27087,0.56323,0.27087,-0.56323,...,-1.114838,-0.50757,-0.499409,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948,-0.260356
4,818,FR,0.143807,-0.617038,-0.92499,0.92499,,0.990324,,-0.990324,...,-0.541465,-0.42455,-1.088158,-1.01156,0.614338,0.729495,0.245109,1.526606,2.614378,-0.071733


In [62]:
# Fonctions de feature engineering reprises du notebook d'origine

def drop_columns(df, columns):
    """Remove the given columns from ``df`` in place.

    Args:
        df: DataFrame to modify; the caller's object itself is changed.
        columns: Iterable of column labels to remove.

    Labels that are not present in ``df`` are skipped silently
    (``errors="ignore"``). Nothing is returned.
    """
    labels = list(columns)
    df.drop(columns=labels, inplace=True, errors="ignore")


def compute_median(df):
    """Return the median of every numeric column of ``df``.

    The result is a Series indexed by column name; non-numeric columns
    are left out.
    """
    numeric_part = df.select_dtypes(include=["number"])
    return numeric_part.median()


def missing_values_changed_with_median(df, medians):
    """Fill missing values of the numeric columns with the supplied medians.

    Args:
        df: DataFrame to fill; it is modified in place.
        medians: Series indexed by column name. It must contain an entry for
            every numeric column of ``df`` (extra entries are ignored).

    Returns:
        The same ``df`` object, for convenience.
    """
    numeric_names = df.select_dtypes(include=["number"]).columns
    replacements = medians[numeric_names]
    filled = df[numeric_names].fillna(replacements)
    df[numeric_names] = filled
    return df


def add_threshold_columns(df: pd.DataFrame, column_name: str, threshold: float, way: str):
    """Add a thresholded copy of ``column_name`` to ``df`` in place.

    The new column is named ``<column_name>_THRESHOLD_<threshold>``.

    Args:
        df: DataFrame receiving the new column.
        column_name: Source column.
        threshold: Cut-off value.
        way: ``"sup"`` keeps values ``>= threshold``; any other value keeps
            values ``<= threshold``. Everything else (missing values
            included) becomes 0.
    """
    new_column = "_THRESHOLD_".join([column_name, str(threshold)])
    source = df[column_name]
    if way == "sup":
        keep = source >= threshold
    else:
        keep = source <= threshold
    df[new_column] = source.where(keep, 0)


def compute_quantiles(df, low=0.25, high=0.75, coeff=5):
    """Compute outlier bounds for every numeric column of ``df``.

    For each column the bounds are
    ``(Q_low - coeff * (Q_high - Q_low), Q_high + coeff * (Q_high - Q_low))``.

    Args:
        df: DataFrame to analyse.
        low: Lower quantile level.
        high: Upper quantile level.
        coeff: Multiplier applied to the inter-quantile range.

    Returns:
        Dict mapping column name to a ``(lower, upper)`` tuple.
    """
    bounds = {}
    for name in df.select_dtypes(include=["number"]).columns:
        q_low, q_high = (df[name].quantile(level) for level in (low, high))
        margin = coeff * (q_high - q_low)
        bounds[name] = (q_low - margin, q_high + margin)
    return bounds


def outliers_filter(df, bounds):
    """Build a boolean mask of the rows lying inside every bound.

    Args:
        df: DataFrame to check.
        bounds: Dict mapping column name to an inclusive ``(lower, upper)``
            pair; entries for columns absent from ``df`` are ignored.

    Returns:
        Boolean Series indexed like ``df``; True where the row is within all
        applicable bounds. A missing value in a bounded column yields False.
    """
    keep = pd.Series(True, index=df.index)
    for name, limits in bounds.items():
        lower, upper = limits
        if name not in df.columns:
            continue
        keep = keep & df[name].between(lower, upper)
    return keep


def feature_engineering(df, medians, threshold, columns_kept):
    """Apply the shared feature-engineering steps to one split.

    Steps, in order: drop a fixed list of columns, fill numeric gaps with
    ``medians``, add one threshold column per entry of ``threshold``, then keep
    only the columns listed in ``columns_kept`` plus every threshold column.

    Args:
        df: Split to transform. It is modified in place by the first steps,
            so pass a copy if the original must be preserved.
        medians: Series of medians used to fill missing numeric values.
        threshold: Dict ``column -> [threshold, way]`` forwarded to
            ``add_threshold_columns``.
        columns_kept: Names of the raw columns to retain.

    Returns:
        DataFrame restricted to the selected columns, in their current order.
    """
    always_dropped = ["DE_NET_IMPORT", "FR_NET_IMPORT", "DE_FR_EXCHANGE", "FR_COAL"]
    drop_columns(df, always_dropped)

    df = missing_values_changed_with_median(df, medians)

    for source_column, spec in threshold.items():
        cutoff, direction = spec[0], spec[1]
        add_threshold_columns(df, source_column, cutoff, direction)

    def is_selected(name):
        return (name in columns_kept) or ("_THRESHOLD_" in name)

    selected = [name for name in df.columns if is_selected(name)]
    return df[selected]


def transform_one_country(df, threshold, columns_kept, standardisation=True):
    """Split one country's data and prepare the train/test feature sets.

    The data is split 80/20 with a fixed seed. Medians, outlier bounds and the
    scaler are all fitted on the training part only; outlier rows are removed
    from the training part only.

    Args:
        df: Rows of a single country, including the ``TARGET`` column.
        threshold: Dict ``column -> [threshold, way]`` for the threshold features.
        columns_kept: Names of the raw columns to retain.
        standardisation: When True, standardise both splits with a
            ``StandardScaler`` fitted on the training split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.drop(columns=["TARGET"])
    target = df["TARGET"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Medians come from the training split and are reused for the test split.
    train_medians = compute_median(X_train)
    X_train, X_test = (
        feature_engineering(part.copy(), train_medians, threshold, columns_kept)
        for part in (X_train, X_test)
    )

    # Outlier rows are removed from the training split only.
    inliers = outliers_filter(X_train, compute_quantiles(X_train))
    X_train, y_train = X_train[inliers], y_train[inliers]

    if not standardisation:
        return X_train, X_test, y_train, y_test

    scaler = StandardScaler()
    train_values = scaler.fit_transform(X_train)
    test_values = scaler.transform(X_test)
    # Re-wrap the arrays so index and column names survive the scaling.
    X_train = pd.DataFrame(train_values, index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(test_values, index=X_test.index, columns=X_test.columns)
    return X_train, X_test, y_train, y_test


def transform(df, threshold_fr, threshold_de, columns_kept_fr, columns_kept_de, standardisation=True):
    """Prepare train/test sets separately for France and Germany.

    Rows are selected on the ``COUNTRY`` column (``"FR"`` / ``"DE"``) and each
    subset goes through ``transform_one_country`` with its own settings.

    Returns:
        An 8-tuple: ``X_train, X_test, y_train, y_test`` for France followed
        by the same four items for Germany.
    """

    def prepare(country, threshold, columns_kept):
        subset = df[df["COUNTRY"] == country].copy()
        return transform_one_country(
            subset, threshold, columns_kept, standardisation=standardisation
        )

    fr_parts = prepare("FR", threshold_fr, columns_kept_fr)
    de_parts = prepare("DE", threshold_de, columns_kept_de)
    return (*fr_parts, *de_parts)


In [63]:
# Parameters already validated in the project
# Threshold features: column -> [threshold, way].
# "sup" keeps values >= threshold; any other way (here "inf") keeps values
# <= threshold; all remaining values are replaced by 0.
threshold_fr = {
    "COAL_RET": [0.8, "inf"],
    "FR_CONSUMPTION": [1.5, "sup"],
    "FR_NUCLEAR": [-1.8, "inf"],
    "FR_HYDRO": [-0.4, "inf"],
}

threshold_de = {
    "DE_CONSUMPTION": [1.2, "sup"],
    "DE_NET_EXPORT": [-0.45, "sup"],
    "DE_WINDPOW": [0.3, "sup"],
}

# Raw columns retained for the French model (threshold columns are always kept too).
columns_kept_fr = [
    "DE_NET_EXPORT",
    "DE_HYDRO",
    "DE_WINDPOW",
    "FR_WINDPOW",
    "GAS_RET",
    "CARBON_RET",
]

# Raw columns retained for the German model (threshold columns are always kept too).
columns_kept_de = [
    "DE_NET_EXPORT",
    "DE_GAS",
    "DE_COAL",
    "DE_HYDRO",
    "DE_WINDPOW",
    "FR_WINDPOW",
    "DE_LIGNITE",
    "DE_RESIDUAL_LOAD",
    "DE_WIND",
]

# Build the per-country train/test sets; standardisation is left at its default (True).
X_train_fr, X_test_fr, y_train_fr, y_test_fr, X_train_de, X_test_de, y_train_de, y_test_de = transform(
    df,
    threshold_fr,
    threshold_de,
    columns_kept_fr,
    columns_kept_de,
)

# Training counts are reported after outlier removal.
print(
    f"France: {X_train_fr.shape[0]} train / {X_test_fr.shape[0]} test -- "
    f"Germany: {X_train_de.shape[0]} train / {X_test_de.shape[0]} test"
)


France: 527 train / 171 test -- Germany: 443 train / 129 test


In [64]:
# Fonctions utilitaires pour l'évaluation

def spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between targets and predictions."""
    rho, _p_value = spearmanr(y_true, y_pred)
    return rho


def kfold_score(model, X, y, k=5):
    """Score ``model`` with shuffled K-Fold cross-validation on Spearman correlation.

    Args:
        model: Estimator to evaluate; a fresh clone is fitted on every fold,
            so the object passed in is never fitted here.
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Target Series aligned with ``X``.
        k: Number of folds.

    Returns:
        Tuple ``(mean, std)`` of the per-fold Spearman correlations.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)

    def score_fold(fit_rows, val_rows):
        candidate = clone(model)
        candidate.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predictions = candidate.predict(X.iloc[val_rows])
        return spearman_corr(y.iloc[val_rows], predictions)

    fold_scores = [score_fold(fit_rows, val_rows) for fit_rows, val_rows in splitter.split(X)]
    return float(np.mean(fold_scores)), float(np.std(fold_scores))

# Helper that fits a model and returns its scores directly
def evaluate_simple(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training set and report train/test metrics.

    Note that ``model`` itself is fitted (no clone is made).

    Returns:
        Dict of floats with the keys ``spearman_train``, ``spearman_test``,
        ``r2_test`` and ``rmse_test``.
    """
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    scores = {
        "spearman_train": spearman_corr(y_train, train_pred),
        "spearman_test": spearman_corr(y_test, test_pred),
        "r2_test": r2_score(y_test, test_pred),
        "rmse_test": np.sqrt(mean_squared_error(y_test, test_pred)),
    }
    return {name: float(value) for name, value in scores.items()}


In [65]:
# 1) Decision Tree Regressor: hyper-parameter search, then a K-Fold check
spearman_scorer = make_scorer(spearman_corr, greater_is_better=True)

fr_param_grid = dict(
    max_depth=[3, 4, 5, 6, 7],
    min_samples_leaf=[5, 10, 20, 30, 50],
    min_samples_split=[5, 10, 20, 30],
)

de_param_grid = dict(
    max_depth=[3, 4, 5, 7, 10, 12],
    min_samples_leaf=[5, 10, 20, 30, 50],
    min_samples_split=[5, 10, 20],
)


def _make_tree_search(param_grid):
    """Build the 5-fold grid search over a seeded decision tree, scored on Spearman."""
    return GridSearchCV(
        DecisionTreeRegressor(random_state=42),
        param_grid=param_grid,
        scoring=spearman_scorer,
        cv=5,
        n_jobs=-1,
    )


fr_search = _make_tree_search(fr_param_grid)
de_search = _make_tree_search(de_param_grid)

fr_search.fit(X_train_fr, y_train_fr)
fr_tree = fr_search.best_estimator_
print("France - meilleurs hyperparamètres :", fr_search.best_params_)

de_search.fit(X_train_de, y_train_de)
de_tree = de_search.best_estimator_
print("Germany - meilleurs hyperparamètres :", de_search.best_params_)

# Re-score the selected trees with the shuffled K-Fold helper on the same training data.
fr_kfold = kfold_score(fr_tree, X_train_fr, y_train_fr, k=5)
de_kfold = kfold_score(de_tree, X_train_de, y_train_de, k=5)

fr_mean, fr_std = fr_kfold
de_mean, de_std = de_kfold
print(f"France - KFold Spearman: mean={fr_mean:.3f}, std={fr_std:.3f}")
print(f"Germany - KFold Spearman: mean={de_mean:.3f}, std={de_std:.3f}")


France - meilleurs hyperparamètres : {'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 20}
Germany - meilleurs hyperparamètres : {'max_depth': 3, 'min_samples_leaf': 20, 'min_samples_split': 5}
France - KFold Spearman: mean=0.150, std=0.054
Germany - KFold Spearman: mean=0.202, std=0.139


In [None]:
# 2) Bagging over the tuned decision trees, with a small hyper-parameter search
base_fr_tree = DecisionTreeRegressor(random_state=42, **fr_search.best_params_)
base_de_tree = DecisionTreeRegressor(random_state=42, **de_search.best_params_)

bagging_fr_param_grid = dict(
    n_estimators=[30, 50, 80],
    max_samples=[0.6, 0.8, 1.0],
    max_features=[0.8, 1.0],
    bootstrap=[True, False],
)

bagging_de_param_grid = dict(
    n_estimators=[30, 60, 100],
    max_samples=[0.6, 0.9, 1.0],
    max_features=[0.7, 1.0],
    bootstrap=[True, False],
)


def _make_bagging_search(base_tree, param_grid):
    """Build the 5-fold grid search over a seeded bagging ensemble of ``base_tree``."""
    return GridSearchCV(
        BaggingRegressor(estimator=base_tree, random_state=42),
        param_grid=param_grid,
        scoring=spearman_scorer,
        cv=5,
        n_jobs=-1,
    )


bagging_fr_search = _make_bagging_search(base_fr_tree, bagging_fr_param_grid)
bagging_fr_search.fit(X_train_fr, y_train_fr)
bagging_fr = bagging_fr_search.best_estimator_
print("France - meilleurs hyperparamètres bagging :", bagging_fr_search.best_params_)

bagging_de_search = _make_bagging_search(base_de_tree, bagging_de_param_grid)
bagging_de_search.fit(X_train_de, y_train_de)
bagging_de = bagging_de_search.best_estimator_
print("Germany - meilleurs hyperparamètres bagging :", bagging_de_search.best_params_)

# evaluate_simple fits the selected ensembles again before scoring them on the hold-out set.
bagging_fr_metrics = evaluate_simple(bagging_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
bagging_de_metrics = evaluate_simple(bagging_de, X_train_de, y_train_de, X_test_de, y_test_de)

fr_holdout, fr_train = bagging_fr_metrics["spearman_test"], bagging_fr_metrics["spearman_train"]
de_holdout, de_train = bagging_de_metrics["spearman_test"], bagging_de_metrics["spearman_train"]

print("France - Bagging optimisé")
print(f"  Spearman hold-out: {fr_holdout:.3f}")
print(f"  Spearman train   : {fr_train:.3f}")
print("Germany - Bagging optimisé")
print(f"  Spearman hold-out: {de_holdout:.3f}")
print(f"  Spearman train   : {de_train:.3f}")


France - Bagging vs KFold
  Spearman hold-out: 0.180
  Spearman train   : 0.432
Germany - Bagging vs KFold
  Spearman hold-out: 0.206
  Spearman train   : 0.580


Les cellules précédentes permettent de comparer la moyenne du Spearman obtenue en K-Fold sur l'arbre de décision (référence du pipeline actuel) avec le score hold-out du Bagging. Ces deux scores ne sont pas mesurés sur les mêmes données : la comparaison donne donc une indication, et non une mesure exacte, du gain ou de la perte de stabilité apporté par l'agrégation d'arbres (bagging).


In [43]:
# 3) SVM (SVR) on both countries, with identical settings
svr_settings = {"kernel": "rbf", "C": 1.0, "epsilon": 0.1, "gamma": "scale"}
svr_fr = SVR(**svr_settings)
svr_de = SVR(**svr_settings)

svr_fr_metrics = evaluate_simple(svr_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
svr_de_metrics = evaluate_simple(svr_de, X_train_de, y_train_de, X_test_de, y_test_de)

print("France - SVR", svr_fr_metrics)
print("Germany - SVR", svr_de_metrics)


France - SVR {'spearman_train': 0.6436868657986635, 'spearman_test': 0.24208620856210022, 'r2_test': 0.014304828204079856, 'rmse_test': 1.1788894043251976}
Germany - SVR {'spearman_train': 0.6173299570827954, 'spearman_test': 0.2812723613595707, 'r2_test': -0.000896226328831462, 'rmse_test': 0.9766228693649052}


In [44]:
# 4) Random Forest (hand-picked settings, no search)
rf_shared_settings = {"random_state": 42, "n_jobs": -1}

rf_fr = RandomForestRegressor(
    n_estimators=300, max_depth=8, min_samples_leaf=20, **rf_shared_settings
)
rf_de = RandomForestRegressor(
    n_estimators=400, max_depth=10, min_samples_leaf=10, **rf_shared_settings
)

rf_fr_metrics = evaluate_simple(rf_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
rf_de_metrics = evaluate_simple(rf_de, X_train_de, y_train_de, X_test_de, y_test_de)

print("France - Random Forest", rf_fr_metrics)
print("Germany - Random Forest", rf_de_metrics)


France - Random Forest {'spearman_train': 0.47569906268285644, 'spearman_test': 0.19730253996123978, 'r2_test': 0.012698839215019575, 'rmse_test': 1.1798493932268128}
Germany - Random Forest {'spearman_train': 0.6802854052275341, 'spearman_test': 0.2264031753130591, 'r2_test': 0.03846469950693743, 'rmse_test': 0.9572270889975646}


In [45]:
# 5) XGBoost (only when the package is available in the environment)
# XGBOOST_AVAILABLE was never assigned anywhere in this notebook, so running the
# cells in order on a fresh kernel failed here with a NameError. The flag is now
# derived from a guarded import, which also makes this cell self-sufficient.
# NOTE(review): the import cell at the top imports xgboost unconditionally; the
# "not installed" branch below is only reachable if that import is guarded as well.
try:
    from xgboost import XGBRegressor
except ImportError:
    XGBOOST_AVAILABLE = False
else:
    XGBOOST_AVAILABLE = True

if XGBOOST_AVAILABLE:
    xgb_fr = XGBRegressor(
        n_estimators=600,
        max_depth=4,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1,
    )

    xgb_de = XGBRegressor(
        n_estimators=800,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1,
    )

    xgb_fr_metrics = evaluate_simple(xgb_fr, X_train_fr, y_train_fr, X_test_fr, y_test_fr)
    xgb_de_metrics = evaluate_simple(xgb_de, X_train_de, y_train_de, X_test_de, y_test_de)

    print("France - XGBoost", xgb_fr_metrics)
    print("Germany - XGBoost", xgb_de_metrics)
else:
    print("XGBoost n'est pas installé. Lancer `pip install xgboost` puis ré exécuter cette cellule.")


France - XGBoost {'spearman_train': 0.9459175459177872, 'spearman_test': 0.22669152248277447, 'r2_test': -0.04491863252014694, 'rmse_test': 1.2137884104041166}
Germany - XGBoost {'spearman_train': 0.9998424736950918, 'spearman_test': 0.14881484794275493, 'r2_test': -0.11894051411467887, 'rmse_test': 1.0326088987650484}


In [46]:
# 6) Quick summary table (run after the previous cells)


def _summary_row(model, country, spearman, std=np.nan, note="hold-out"):
    """Build one row of the summary table; defaults describe a hold-out score."""
    return {
        "model": model,
        "country": country,
        "spearman": spearman,
        "std": std,
        "note": note,
    }


# The decision-tree reference is the only entry with a cross-validated mean and std.
summary_rows = [
    _summary_row("DecisionTree_KFold", "FR", fr_kfold[0], fr_kfold[1], "validation croisée"),
    _summary_row("DecisionTree_KFold", "DE", de_kfold[0], de_kfold[1], "validation croisée"),
]

# (model name, French metrics, German metrics) for every hold-out evaluation.
holdout_results = [
    ("DecisionTree_Bagging", bagging_fr_metrics, bagging_de_metrics),
    ("SVR", svr_fr_metrics, svr_de_metrics),
    ("RandomForest", rf_fr_metrics, rf_de_metrics),
]

# The XGBoost rows used to be guarded by XGBOOST_AVAILABLE, a name that is not
# assigned anywhere in this notebook (NameError on a fresh kernel). Checking for
# the metrics themselves is what this cell actually needs.
if "xgb_fr_metrics" in globals() and "xgb_de_metrics" in globals():
    holdout_results.append(("XGBoost", xgb_fr_metrics, xgb_de_metrics))

for model_name, fr_metrics, de_metrics in holdout_results:
    summary_rows.append(_summary_row(model_name, "FR", fr_metrics["spearman_test"]))
    summary_rows.append(_summary_row(model_name, "DE", de_metrics["spearman_test"]))

summary_df = pd.DataFrame(summary_rows)
display(summary_df)



Unnamed: 0,model,country,spearman,std,note
0,DecisionTree_KFold,FR,0.058745,0.063139,validation croisée
1,DecisionTree_KFold,DE,0.054465,0.083993,validation croisée
2,DecisionTree_Bagging,FR,0.180239,,hold-out
3,DecisionTree_Bagging,DE,0.2063,,hold-out
4,SVR,FR,0.242086,,hold-out
5,SVR,DE,0.281272,,hold-out
6,RandomForest,FR,0.197303,,hold-out
7,RandomForest,DE,0.226403,,hold-out
8,XGBoost,FR,0.226692,,hold-out
9,XGBoost,DE,0.148815,,hold-out


## Conclusion rapide
- Les arbres ont maintenant deux références : K-Fold (pipeline historique) et Bagging (hyperparamètres choisis par GridSearchCV, score mesuré en hold-out) pour vérifier l'impact de l'agrégation.
- Les modèles SVR, Random Forest et XGBoost (optionnel) suivent exactement les mêmes données transformées et permettent de comparer facilement les Spearman.
- Il suffit de lancer les cellules dans l'ordre pour mettre à jour les scores avant de reporter les meilleurs résultats dans le rapport final.
