# Feature engineering

## Library importation

In [38]:
# Traitement de données
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.base import clone
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor


## Download the dataset

In [39]:
X = pd.read_csv("X_train_NHkHMNU.csv")
y = pd.read_csv("y_train_ZAN5mwg.csv")

df = pd.concat([X, y], axis=1)

df = df.drop(df.columns[-2], axis=1)

## Feature Engineering

Feature engineering is a key step in a machine Learning project. This step prepares the data for the models. Here are the steps we followed to prepare the dataset : 

**Remove columns that have -1 correlation**

Some vairables have -1 correlation :
- `DE_NET_EXPORT` and `DE_NET_IMPORT`
- `FR_NET_EXPORT` and `FR_NET_IMPORT`
- `DE_FR_EXCHANGE` and `FR_DE_EXCHANGE`

Moreover they have the same correlation with the other variables. So keeping both variables doesn't add meaning full information. That is why we chose to drop one of the variables from each -1 correlation.

**Remove `FR_COAL` variable**

This variable is not diversified. Thus its values are not interesting to keep.

**Split the dataset**

As decided thanks to the data analysis, we splited the dataset into two : french and german dataset.

**Remove Nan Values from both dataset**

The proportion of Nan values as well as the few rows we have for each dataset were the reasons why we chose to replace nan values by the median of each column.

**Create additionnal columns according to a Threshold**

Seuils pour df_fr
- COAL_RET < 0.8
- FR_CONSUMPTION > 1.5
- FR_NUCLEAR < -1.8
- FR_HYDRO < -0.4

Seuils pour df_de
- DE_CONSUMPTION > 1.2
- DE_NET_EXPORT > -0.45
- DE_WINDPOW > 0.3

Transformation "ReLu"

**Remove Columns that have a low correlation with the TARGET variable**

Each variables whose spearman corelation with the `TARGET` variable is lower than 0.05 will be removed from the dataset. We don't consider those variables to have a correlation high enough to have a positive impact on models' performance.

#### Global variables

In [40]:
threshold_fr = {"COAL_RET": [0.8, "inf"],
                "FR_CONSUMPTION": [1.5, "sup"],
                "FR_NUCLEAR": [-1.8, "inf"],
                "FR_HYDRO":[-0.4, "inf"]                
                }

threshold_de = {"DE_CONSUMPTION": [1.2, "sup"],
                "DE_NET_EXPORT": [-0.45, "sup"],
                "DE_WINDPOW": [0.3, "sup"]
}

# COLONNES RECUPEREES TEMPORAIREMENT A LA MAIN CAR SEPARATIONN DES FICHIERS ANALYSES ET ENGINEERING
# A RECUPER DES VARIBALES QUAND LE RASSEMBLEMENT DES FICHIERS SERA FAIT
columns_kept_fr = ["DE_NET_EXPORT",
                "DE_HYDRO",
                "DE_WINDPOW",
                "FR_WINDPOW",
                "GAS_RET",
                "CARBON_RET"]

columns_kept_de = ["DE_NET_EXPORT",
                "DE_GAS",
                "DE_COAL",
                "DE_HYDRO",
                "DE_WINDPOW",
                "FR_WINDPOW",
                "DE_LIGNITE",
                "DE_RESIDUAL_LOAD",
                "DE_WIND"]

In [41]:
def drop_columns(df, columns):
    for c in columns:
        df.drop(columns=c, inplace=True, errors="ignore")

def compute_median(df):
    numeric_cols = df.select_dtypes(include=["number"]).columns
    medians = df[numeric_cols].median()
    return medians

def missing_values_changed_with_median(df, medians):
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df[numeric_cols] = df[numeric_cols].fillna(medians[numeric_cols])
    return df

def add_threshold_columns(df: pd.DataFrame, column_name: str, threshold: float, way: str):
    message = column_name + "_THRESHOLD_" + str(threshold)
    # when way = "sup", we want to keep only values that are higher than the threshold
    # else we keep the values that are lower than the threshold
    if way == "sup":
        df[message] = df[column_name].where(df[column_name] >= threshold, 0)
    else:
        df[message] = df[column_name].where(df[column_name] <= threshold, 0)

def compute_quantiles(df, low = 0.25, high = 0.75, coeff=5):
    bounds = {}
    for column in df.select_dtypes(include=["number"]).columns:
        Q1 = df[column].quantile(low)
        Q3 = df[column].quantile(high)
        delta = Q3 - Q1
        lower_bound = Q1 - coeff * delta
        upper_bound = Q3 + coeff * delta
        bounds[column] = (lower_bound, upper_bound)
    return bounds

def outliers_filter(df, bounds):
    filter_ = pd.Series(True, index=df.index)
    for column, (low, high) in bounds.items():
        if column in df.columns:
            filter_ &= (df[column] >= low) & (df[column] <= high)
    return filter_

def feature_engineering(df, medians, threshold, columns_kept):
    # remove unecessary columns
    columns_name = ["DE_NET_IMPORT", "FR_NET_IMPORT", "DE_FR_EXCHANGE"]
    drop_columns(df, columns_name)

    # remove FR_COAL
    drop_columns(df, ["FR_COAL"])

    # modify missing values
    df = missing_values_changed_with_median(df, medians)

    # add threshold columns to the french dataset
    for key, value in threshold.items():
        add_threshold_columns(df, key, value[0], value[1])

    # drop columns that are not in the list or that have not _THRESHOLD_ in their name
    to_keep = [c for c in df.columns if (c in columns_kept) or ("_THRESHOLD_" in c)]
    df = df[to_keep]
    
    return df

def transform_one_country(df, threshold, columns_kept, standardisation = True):
    X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["TARGET"]), df["TARGET"], test_size=0.2, random_state=42)

    medians = compute_median(X_train)

    X_train = feature_engineering(X_train, medians, threshold, columns_kept)
    X_test = feature_engineering(X_test, medians, threshold, columns_kept)

    # filter : remove outliers from the train data
    bounds = compute_quantiles(X_train)
    filter_ = outliers_filter(X_train, bounds)
    X_train = X_train[filter_]
    y_train = y_train[filter_]

    if standardisation:
        # Standardisation
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled  = scaler.transform(X_test)

        # X_train_fr_scaled, X_test_fr_scaled, X_train_de_scaled, X_test_de_scaled are not dataframe, 
        # we prefer to work with dataframe to keep columns name
        X_train = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)
        X_test  = pd.DataFrame(X_test_scaled,  index=X_test.index,  columns=X_test.columns)


    return X_train, X_test, y_train, y_test

def transform(df, threshold_fr, threshold_de, columns_kept_fr, columns_kept_de, standardisation):
    # split the dataset
    df_fr = df[df["COUNTRY"] == "FR"].copy()
    df_de = df[df["COUNTRY"] == "DE"].copy()

    X_train_fr, X_test_fr, y_train_fr, y_test_fr = transform_one_country(
        df_fr, threshold_fr, columns_kept_fr, standardisation=standardisation
    )

    X_train_de, X_test_de, y_train_de, y_test_de = transform_one_country(
        df_de, threshold_de, columns_kept_de, standardisation=standardisation
    )

    return (
        X_train_fr, X_test_fr, y_train_fr, y_test_fr,
        X_train_de, X_test_de, y_train_de, y_test_de
    )

## Pipeline for all models

We observe that if our features engineering seems very relevant for simple and interpretable models, however models that handle better the complexity and non linear relationsip didn't require as feature engineering than a simple linear regression. For that purpose the goal of this part is to do a general pipeline using the last feature engineering pipeline to have a flexible way of testing new models. Furthermore since the observation of an important part of outliers in the French side, make the relationships very noisy, we will remove the extreme outliers, only on training data. We also aim to have the possibilitie to use a different model for France and Allemagne since the optimal model for each could be different. Finally in our objective to avoid overfitting we will use K-fold optimization.

In [42]:
def spearman_corr(y_true, y_pred):
    return spearmanr(y_true, y_pred).correlation

def kfold_score(model, X, y, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model_ = clone(model)  
        model_.fit(X_train, y_train)
        y_pred = model_.predict(X_val)

        scores.append(spearman_corr(y_val, y_pred))

    return np.mean(scores), np.std(scores)

def build_bagging_decision_trees(fr_tree_params, de_tree_params, fr_bagging_params, de_bagging_params):
    fr_tree = DecisionTreeRegressor(random_state=42, **fr_tree_params)
    de_tree = DecisionTreeRegressor(random_state=42, **de_tree_params)

    bagging_fr = BaggingRegressor(estimator=fr_tree, **fr_bagging_params)
    bagging_de = BaggingRegressor(estimator=de_tree, **de_bagging_params)

    return bagging_fr, bagging_de

In [43]:
def pipeline_all(
    df,
    fr_model,
    de_model,
    threshold_fr=threshold_fr,
    threshold_de=threshold_de,
    columns_kept_fr=columns_kept_fr,
    columns_kept_de=columns_kept_de,
    feature_engineering=True,
    standardisation=True,
    use_grid=False,
    k=5,
    cv_mode_label=None,
):
    # comparison with or without feature engineering
    if feature_engineering:
        (X_train_fr, X_test_fr, y_train_fr, y_test_fr,
            X_train_de, X_test_de, y_train_de, y_test_de) = transform(
                                                            df,
                                                            threshold_fr=threshold_fr,
                                                            threshold_de=threshold_de,
                                                            columns_kept_fr=columns_kept_fr,
                                                            columns_kept_de=columns_kept_de,
                                                            standardisation=standardisation)
    else:
        df_fr = df[df["COUNTRY"] == "FR"].drop(columns="COUNTRY").fillna(0)
        df_de = df[df["COUNTRY"] == "DE"].drop(columns="COUNTRY").fillna(0)
        X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(df_fr.drop(columns=["TARGET"]), df_fr["TARGET"], test_size=0.2, random_state=42)
        X_train_de, X_test_de, y_train_de, y_test_de = train_test_split(df_de.drop(columns=["TARGET"]), df_de["TARGET"], test_size=0.2, random_state=42)

    if use_grid:
        # france
        fr_model.fit(X_train_fr, y_train_fr)
        fr_mean = fr_model.best_score_
        fr_cv_scores = fr_model.cv_results_["mean_test_score"]
        fr_std = fr_cv_scores.std()
        fr_estimator = fr_model.best_estimator_

        # germany
        de_model.fit(X_train_de, y_train_de)
        de_mean = de_model.best_score_
        de_cv_scores = de_model.cv_results_["mean_test_score"]
        de_std = de_cv_scores.std()
        de_estimator = de_model.best_estimator_
    else:
        # k_fold
        fr_mean, fr_std = kfold_score(fr_model, X_train_fr, y_train_fr, k=k)
        de_mean, de_std = kfold_score(de_model, X_train_de, y_train_de, k=k)

        fr_estimator = fr_model
        de_estimator = de_model

        fr_estimator.fit(X_train_fr, y_train_fr)
        de_estimator.fit(X_train_de, y_train_de)

    # Test evaluation
    y_pred_test_fr = fr_estimator.predict(X_test_fr)
    y_pred_test_de = de_estimator.predict(X_test_de)

    fr_test_score = spearman_corr(y_test_fr, y_pred_test_fr)
    de_test_score = spearman_corr(y_test_de, y_pred_test_de)

    # Global Spearman
    y_true_global = np.concatenate([y_test_fr, y_test_de])
    y_pred_global = np.concatenate([y_pred_test_fr, y_pred_test_de])
    spearman_global = spearman_corr(y_true_global, y_pred_global)

    mode_label = cv_mode_label or ("grid_search" if use_grid else "kfold")

    return {
    "model_fr" : fr_model,
    "model_de" : de_model,
    "cv_mode": mode_label,
    "fr_cv": (fr_mean, fr_std),
    "de_cv": (de_mean, de_std),
    "spearman_fr_test": fr_test_score,
    "spearman_de_test": de_test_score,
    "spearman_global_test": spearman_global,
    "features_engineering": feature_engineering,
    "standardisation":standardisation
}

# MODELS

In [44]:
allowed_cols = [
    "model_fr",
    "model_de",
    "spearman_global_test",
    "spearman_fr_test",
    "spearman_de_test",
    "cv_mode",
    "features_engineering",
    "standardisation"
]

df_results = pd.DataFrame(columns=allowed_cols)

def display(results):
    for key, value in results.items():
        print(key, ":", value)

### Basic Model

The first step is to test the simpliest model with almost no feature engineering, to have a sort of reference model and to not considerate all the models less performant. In this first implementation the dataset isn't separate between France and Germany, all the columns are keep and there is no transformation on the columns. The model used is a linear regression.

In [45]:
X_all = df.drop(columns=["TARGET", "COUNTRY"]).fillna(0)
y_all = df["TARGET"]


X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_test  = lr.predict(X_test)


print("Spearman train : {:.1f}%".format(100 * spearman_corr(y_train, y_pred_train)))
print("Spearman test  : {:.1f}%".format(100 * spearman_corr(y_test,  y_pred_test)))

Spearman train : 28.9%
Spearman test  : 19.5%


## Models with our Pipeline

### Linear Regression


In [46]:
res = pipeline_all(df, LinearRegression(), LinearRegression())
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}

model_fr : LinearRegression()
model_de : LinearRegression()
cv_mode : kfold
fr_cv : (np.float64(0.20356753106076667), np.float64(0.08232517437563475))
de_cv : (np.float64(0.24022632190504875), np.float64(0.12521555406740847))
spearman_fr_test : 0.1657655733347872
spearman_de_test : 0.392464221824687
spearman_global_test : 0.2709933537010029
features_engineering : True
standardisation : True


We can see an important improvement of our spearman score, with an improvement of 8% comparing to the reference model (from 19% to 27%). This justify our global strategy at least for Linear Regression.

### Polynomiale Regression 

In [47]:
poly_model_fr = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lr", LinearRegression())
])

poly_model_de = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lr", LinearRegression())
])

res = pipeline_all(df, poly_model_fr, poly_model_de)
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}

model_fr : Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('lr', LinearRegression())])
model_de : Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('lr', LinearRegression())])
cv_mode : kfold
fr_cv : (np.float64(0.08015539997634188), np.float64(0.09902778164029584))
de_cv : (np.float64(0.07672971424295011), np.float64(0.10003044840653669))
spearman_fr_test : 0.23626385164754035
spearman_de_test : 0.25490831842576034
spearman_global_test : 0.2508406763811741
features_engineering : True
standardisation : True


With polynomial regression, we keep improving our performance, however this model seems adapted only for the french dataset an hybrid model (polynomial regression for the french dataset and linear regression for the deutsh one)

#

### A simple hybrid model

In [48]:
res = pipeline_all(df, poly_model_fr, LinearRegression())
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}

model_fr : Pipeline(steps=[('poly', PolynomialFeatures(include_bias=False)),
                ('lr', LinearRegression())])
model_de : LinearRegression()
cv_mode : kfold
fr_cv : (np.float64(0.08015539997634188), np.float64(0.09902778164029584))
de_cv : (np.float64(0.24022632190504875), np.float64(0.12521555406740847))
spearman_fr_test : 0.23626385164754035
spearman_de_test : 0.392464221824687
spearman_global_test : 0.30924401793090445
features_engineering : True
standardisation : True


### Decision Tree Regressor 

In [49]:
spearman_score = make_scorer(spearman_corr, greater_is_better=True)
fr_param_grid = {
    "model__max_depth": [3, 4, 5, 7],
    "model__min_samples_leaf": [10, 20, 50],
    "model__min_samples_split": [10, 20, 30]
}

fr_base = Pipeline([("model", DecisionTreeRegressor(random_state=42))])

fr_search = GridSearchCV(
    estimator=fr_base,
    param_grid=fr_param_grid,
    scoring=spearman_score,
    cv=5,
    n_jobs=1,
    refit=True
)

de_param_grid = {
    "model__max_depth": [3, 4, 5, 7, 10, 15],
    "model__min_samples_leaf": [5, 10, 20, 30, 50],
    "model__min_samples_split": [5, 10, 20]
}

de_base = Pipeline([("model", DecisionTreeRegressor(random_state=42))])

de_search = GridSearchCV(
    estimator=de_base,
    param_grid=de_param_grid,
    scoring=spearman_score,
    cv=5,
    n_jobs=1,
    refit=True
)

res = pipeline_all(df, fr_model=fr_search, de_model=de_search, use_grid=True)
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}


model_fr : GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('model',
                                        DecisionTreeRegressor(random_state=42))]),
             n_jobs=1,
             param_grid={'model__max_depth': [3, 4, 5, 7],
                         'model__min_samples_leaf': [10, 20, 50],
                         'model__min_samples_split': [10, 20, 30]},
             scoring=make_scorer(spearman_corr, response_method='predict'))
model_de : GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('model',
                                        DecisionTreeRegressor(random_state=42))]),
             n_jobs=1,
             param_grid={'model__max_depth': [3, 4, 5, 7, 10, 15],
                         'model__min_samples_leaf': [5, 10, 20, 30, 50],
                         'model__min_samples_split': [5, 10, 20]},
             scoring=make_scorer(spearman_corr, response_method='predict'))
cv_mode : grid_search
fr_cv : (np.float64(0.07108673777153425), np.float64(0.0

#### Decision Tree (k-fold)


In [50]:
fr_tree_params = {"max_depth": 5, "min_samples_leaf": 5, "min_samples_split": 20}
de_tree_params = {"max_depth": 3, "min_samples_leaf": 20, "min_samples_split": 5}

fr_tree = DecisionTreeRegressor(random_state=42, **fr_tree_params)
de_tree = DecisionTreeRegressor(random_state=42, **de_tree_params)

res = pipeline_all(df, fr_tree, de_tree)
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}


model_fr : DecisionTreeRegressor(max_depth=5, min_samples_leaf=5, min_samples_split=20,
                      random_state=42)
model_de : DecisionTreeRegressor(max_depth=3, min_samples_leaf=20, min_samples_split=5,
                      random_state=42)
cv_mode : kfold
fr_cv : (np.float64(0.14971576303737627), np.float64(0.05351063373639738))
de_cv : (np.float64(0.20157967688667733), np.float64(0.1391080397964053))
spearman_fr_test : 0.0706679694385563
spearman_de_test : 0.22403112608507902
spearman_global_test : 0.14901607641289508
features_engineering : True
standardisation : True


Decision trees performed significantly worse than linear and polynomial models. Despite extensive hyperparameter, the models failed to capture stable relationships, showing high variance and bad generalisation. This suggests that the dataset does not exhibit strong hierarchical or rule-based patterns, and tree-based splits are overly sensitive to noise, especially for the French subset, which contains many extreme values.

#### Decision Tree with Bagging


In [51]:
fr_bagging_params = {
    "n_estimators": 30,
    "max_samples": 1.0,
    "max_features": 1.0,
    "bootstrap": False,
    "n_jobs": -1,
    "random_state": 42,
}

de_bagging_params = {
    "n_estimators": 100,
    "max_samples": 0.9,
    "max_features": 0.7,
    "bootstrap": False,
    "n_jobs": -1,
    "random_state": 42,
}

bagging_fr, bagging_de = build_bagging_decision_trees(
    fr_tree_params,
    de_tree_params,
    fr_bagging_params,
    de_bagging_params,
)

res = pipeline_all(df, bagging_fr, bagging_de, cv_mode_label="bagging")
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}


model_fr : BaggingRegressor(bootstrap=False,
                 estimator=DecisionTreeRegressor(max_depth=5,
                                                 min_samples_leaf=5,
                                                 min_samples_split=20,
                                                 random_state=42),
                 n_estimators=30, n_jobs=-1, random_state=42)
model_de : BaggingRegressor(bootstrap=False,
                 estimator=DecisionTreeRegressor(max_depth=3,
                                                 min_samples_leaf=20,
                                                 min_samples_split=5,
                                                 random_state=42),
                 max_features=0.7, max_samples=0.9, n_estimators=100, n_jobs=-1,
                 random_state=42)
cv_mode : bagging
fr_cv : (np.float64(0.14971576303737627), np.float64(0.05351063373639738))
de_cv : (np.float64(0.2098943397625738), np.float64(0.11103529895649739))
spearman_fr_test : 0.0706679

Although bagging reduced variance compared to standalone trees, performance remained inferior to simpler linear models. The improvement on the German dataset was low, but the French dataset remained fine. The results indicate that ensembling does not sufficiently stabilize trees when the underlying signal-to-noise ratio is low.

### Support Vector Regressor


In [52]:
svr_params = {"C": 10, "kernel": "linear", "gamma": "scale", "epsilon": 0.1}

svr_fr = SVR(**svr_params)
svr_de = SVR(**svr_params)

res = pipeline_all(df, svr_fr, svr_de)
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}


model_fr : SVR(C=10, kernel='linear')
model_de : SVR(C=10, kernel='linear')
cv_mode : kfold
fr_cv : (np.float64(0.21711805518813038), np.float64(0.10847272088092935))
de_cv : (np.float64(0.27390404160235177), np.float64(0.1278410551302806))
spearman_fr_test : 0.22184475834882816
spearman_de_test : 0.37359682468694105
spearman_global_test : 0.30793333611858115
features_engineering : True
standardisation : True


SVR handles outliers more robustly than for classical linear regression, which may explain the particularly strong performance on the German dataset.


### Random Forest Regressor


In [53]:
rf_fr = RandomForestRegressor(
    n_estimators=300,
    max_depth=8,
    min_samples_leaf=20,
    random_state=42,
    n_jobs=-1,
)

rf_de = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
)

res = pipeline_all(df, rf_fr, rf_de)
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}


model_fr : RandomForestRegressor(max_depth=8, min_samples_leaf=20, n_estimators=300,
                      n_jobs=-1, random_state=42)
model_de : RandomForestRegressor(max_depth=10, min_samples_leaf=10, n_estimators=400,
                      n_jobs=-1, random_state=42)
cv_mode : kfold
fr_cv : (np.float64(0.14865402867054464), np.float64(0.07712237886188927))
de_cv : (np.float64(0.24473379313375196), np.float64(0.1083541617965652))
spearman_fr_test : 0.19730253996123978
spearman_de_test : 0.2264031753130591
spearman_global_test : 0.22525993633706437
features_engineering : True
standardisation : True


Random Forests did not outperform simpler regressors. Even if their ability to model complex interactions, they suffered from noise and lack of strong partition structures in the data.

### XGBoost Regressor


In [54]:
xgb_params = {
    "n_estimators": 200,
    "max_depth": 3,
    "learning_rate": 0.03,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "reg_lambda": 2.0,
    "reg_alpha": 1.0,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}

xgb_fr = XGBRegressor(**xgb_params)
xgb_de = XGBRegressor(**xgb_params)

res = pipeline_all(df, xgb_fr, xgb_de)
display(res)
df_results.loc[len(df_results)] = {key: res[key] for key in allowed_cols}


model_fr : XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.7, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.03, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=200,
             n_jobs=-1, num_parallel_tree=None, ...)
model_de : XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.7, device=None, early_stopping_rounds=None,
             enable_categorical=False, 

XGBoost performed slightly better than Random Forests. Even after parameter tuning, the gains still limited. This may indicates that boosting is unable to compensate for the limited structure and relatively low signal-to-noise ratio of the input features.

# Results

In [55]:
df_results.sort_values(by="spearman_global_test", ascending=False)

Unnamed: 0,model_fr,model_de,spearman_global_test,spearman_fr_test,spearman_de_test,cv_mode,features_engineering,standardisation
2,"(PolynomialFeatures(include_bias=False), Linea...",LinearRegression(),0.309244,0.236264,0.392464,kfold,True,True
6,"SVR(C=10, kernel='linear')","SVR(C=10, kernel='linear')",0.307933,0.221845,0.373597,kfold,True,True
0,LinearRegression(),LinearRegression(),0.270993,0.165766,0.392464,kfold,True,True
1,"(PolynomialFeatures(include_bias=False), Linea...","(PolynomialFeatures(include_bias=False), Linea...",0.250841,0.236264,0.254908,kfold,True,True
8,"XGBRegressor(base_score=None, booster=None, ca...","XGBRegressor(base_score=None, booster=None, ca...",0.225757,0.210939,0.221836,kfold,True,True
7,"(DecisionTreeRegressor(max_depth=8, max_featur...","(DecisionTreeRegressor(max_depth=10, max_featu...",0.22526,0.197303,0.226403,kfold,True,True
3,"GridSearchCV(cv=5,\n estimator=Pip...","GridSearchCV(cv=5,\n estimator=Pip...",0.189236,0.138723,0.224031,grid_search,True,True
5,"(DecisionTreeRegressor(max_depth=5, min_sample...","(DecisionTreeRegressor(max_depth=3, min_sample...",0.164329,0.070668,0.210689,bagging,True,True
4,"DecisionTreeRegressor(max_depth=5, min_samples...","DecisionTreeRegressor(max_depth=3, min_samples...",0.149016,0.070668,0.224031,kfold,True,True


We observe poor performance, especially for the German dataset. Essayer d'optimiser ça???