# Feature engineering

## Library imports

In [47]:
# Traitement de données
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from typing import List
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV




## Load the dataset

In [48]:
# Load the feature table and the target table from local CSV files.
X = pd.read_csv("X_train_NHkHMNU.csv")
y = pd.read_csv("y_train_ZAN5mwg.csv")

# Put features and target side by side (index-aligned, column-wise concat).
merged = pd.concat([X, y], axis=1)

# Drop by label: merged.columns[-2] is the label of the second-to-last column,
# and drop() removes every column carrying that label.
# NOTE(review): presumably this is the identifier column of the target file - confirm.
df = merged.drop(columns=merged.columns[-2])

## Remove columns that have -1 correlation

Some variables have a correlation of -1:
- `DE_NET_EXPORT` and `DE_NET_IMPORT`
- `FR_NET_EXPORT` and `FR_NET_IMPORT`
- `DE_FR_EXCHANGE` and `FR_DE_EXCHANGE`

Moreover, they have the same correlation (up to sign) with the other variables, so keeping both variables doesn't add meaningful information. That is why we chose to drop one variable from each pair with -1 correlation.

In [49]:
# One column from each perfectly anti-correlated pair is removed.
columns_name = ["DE_NET_IMPORT", "FR_NET_IMPORT", "DE_FR_EXCHANGE"]

# Single non-mutating drop instead of one in-place drop per column.
df = df.drop(columns=columns_name)

## Split the dataset

In [50]:
# One sub-frame per country; .copy() gives each an independent frame so later
# column assignments do not touch df.
is_fr = df["COUNTRY"] == "FR"
is_de = df["COUNTRY"] == "DE"

df_fr = df.loc[is_fr].copy()
df_de = df.loc[is_de].copy()

## Replace NaN values in both datasets

In [51]:
# French dataset: fill NaNs in the numeric columns with each column's median,
# computed on the French rows only.
numeric_cols_fr = df_fr.select_dtypes(include=["number"]).columns
medians_fr = df_fr[numeric_cols_fr].median()
df_fr[numeric_cols_fr] = df_fr[numeric_cols_fr].fillna(medians_fr)

# German dataset: same treatment, with medians computed on the German rows only.
numeric_cols_de = df_de.select_dtypes(include=["number"]).columns
medians_de = df_de[numeric_cols_de].median()
df_de[numeric_cols_de] = df_de[numeric_cols_de].fillna(medians_de)

## Create additional columns that represent a threshold

##### Seuils pour df_fr
- COAL_RET < 0.8
- FR_CONSUMPTION > 1.5
- FR_NUCLEAR < -1.8
- FR_HYDRO < -0.4

##### Seuils pour df_de
- DE_CONSUMPTION > 1.2
- DE_NET_EXPORT > -0.45
- DE_WINDPOW > 0.3


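"ReLU"-like transformation: each new column keeps the original value where the threshold condition holds and is set to 0 everywhere else.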

In [52]:
def AddSeuilColumn(df: pd.DataFrame, column_name: str, seuil: float, way: str) -> None:
    """Add a thresholded copy of ``column_name`` to ``df`` (modifies ``df`` in place).

    The new column is named ``<column_name>_THRESHOLD_<seuil>``. It holds the
    original value on the rows that satisfy the threshold condition and 0 on
    all other rows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that receives the new column.
    column_name : str
        Name of the existing column to threshold.
    seuil : float
        Threshold value ("seuil" is French for threshold).
    way : str
        ``"sup"`` keeps values ``>= seuil``; ``"inf"`` keeps values ``<= seuil``.

    Raises
    ------
    ValueError
        If ``way`` is neither ``"sup"`` nor ``"inf"``. Previously any other
        value was silently treated as ``"inf"``, so a typo produced the wrong
        mask without any warning.
    """
    # Validate the direction before touching the frame.
    if way == "sup":
        keep = df[column_name] >= seuil
    elif way == "inf":
        keep = df[column_name] <= seuil
    else:
        raise ValueError(f"way must be 'sup' or 'inf', got {way!r}")

    message = column_name + "_THRESHOLD_" + str(seuil)
    df[message] = df[column_name].where(keep, 0)

# Thresholds per column: column name -> [threshold value, direction], where the
# direction is the `way` argument passed to AddSeuilColumn.
threshold_fr = {
    "COAL_RET": [0.8, "inf"],
    "FR_CONSUMPTION": [1.5, "sup"],
    "FR_NUCLEAR": [-1.8, "inf"],
    "FR_HYDRO": [-0.4, "inf"],
}

threshold_de = {
    "DE_CONSUMPTION": [1.2, "sup"],
    "DE_NET_EXPORT": [-0.45, "sup"],
    "DE_WINDPOW": [0.3, "sup"],
}

# Add the threshold columns: French frame first, then German frame.
for frame, thresholds in ((df_fr, threshold_fr), (df_de, threshold_de)):
    for column, (limit, direction) in thresholds.items():
        AddSeuilColumn(frame, column, limit, direction)

In [53]:
# Display the German frame, now including its threshold columns.
df_de

Unnamed: 0,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_GAS,FR_GAS,DE_COAL,...,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET,DE_CONSUMPTION_THRESHOLD_1.2,DE_NET_EXPORT_THRESHOLD_-0.45,DE_WINDPOW_THRESHOLD_0.3
3,720,DE,-0.983324,-0.849198,0.839586,-0.270870,0.563230,0.487818,0.194659,-1.473817,...,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948,-0.260356,0.000000,-0.270870,0.000000
11,116,DE,-0.055692,-0.811337,-0.237105,-0.851082,-1.091142,0.882313,-0.145637,-0.042992,...,1.777992,1.595158,0.158463,-0.359866,-0.203952,-0.376234,-0.133381,0.000000,0.000000,0.000000
12,406,DE,0.532116,-0.331101,-0.339942,-0.173123,-1.312029,-0.188430,1.382599,-0.354327,...,0.100498,1.241892,-0.206340,1.170760,0.133643,0.033874,0.196312,0.000000,-0.173123,1.448078
13,1175,DE,-0.328286,-1.062255,1.380464,-1.046122,1.002243,0.544008,-0.730992,-1.183566,...,-0.695013,-0.634046,-0.168491,0.122818,0.220077,5.453331,-0.025477,0.000000,0.000000,0.359210
14,309,DE,1.028987,1.629315,-1.129663,-0.391261,-1.823117,2.170761,2.122272,1.831623,...,-0.945562,-0.667496,-1.566773,0.689483,1.095473,0.342798,0.460278,0.000000,-0.391261,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,78,DE,0.810509,-0.235078,-0.962203,0.971934,-0.121857,0.187811,0.219593,-0.605620,...,-0.456403,1.400533,1.512197,0.793561,0.848558,0.517578,-0.015261,0.000000,0.971934,1.619323
1488,87,DE,-0.295522,-1.051247,1.651718,-1.969871,0.074044,1.172127,0.076716,-0.700021,...,-0.564584,1.120555,0.865314,-0.423802,0.067325,1.558886,0.110206,0.000000,0.000000,0.000000
1489,809,DE,1.529204,1.106682,1.855327,-0.218658,1.450426,1.810665,1.388269,0.359723,...,-0.050781,-0.035360,-0.032517,0.876984,0.819520,1.320373,-0.172597,1.529204,-0.218658,0.000000
1491,1083,DE,0.856399,0.489199,0.255778,-1.531544,-0.829568,2.108764,1.866399,1.072553,...,0.894011,0.256338,0.402316,-1.112899,-0.237835,0.067152,0.151797,0.000000,0.000000,0.000000


In [54]:
# NOTE(review): same expression as the preceding display cell; it shows the German frame again.
df_de

Unnamed: 0,DAY_ID,COUNTRY,DE_CONSUMPTION,FR_CONSUMPTION,FR_DE_EXCHANGE,DE_NET_EXPORT,FR_NET_EXPORT,DE_GAS,FR_GAS,DE_COAL,...,FR_WIND,DE_TEMP,FR_TEMP,GAS_RET,COAL_RET,CARBON_RET,TARGET,DE_CONSUMPTION_THRESHOLD_1.2,DE_NET_EXPORT_THRESHOLD_-0.45,DE_WINDPOW_THRESHOLD_0.3
3,720,DE,-0.983324,-0.849198,0.839586,-0.270870,0.563230,0.487818,0.194659,-1.473817,...,-0.236249,0.350938,-0.417514,0.911652,-0.296168,1.073948,-0.260356,0.000000,-0.270870,0.000000
11,116,DE,-0.055692,-0.811337,-0.237105,-0.851082,-1.091142,0.882313,-0.145637,-0.042992,...,1.777992,1.595158,0.158463,-0.359866,-0.203952,-0.376234,-0.133381,0.000000,0.000000,0.000000
12,406,DE,0.532116,-0.331101,-0.339942,-0.173123,-1.312029,-0.188430,1.382599,-0.354327,...,0.100498,1.241892,-0.206340,1.170760,0.133643,0.033874,0.196312,0.000000,-0.173123,1.448078
13,1175,DE,-0.328286,-1.062255,1.380464,-1.046122,1.002243,0.544008,-0.730992,-1.183566,...,-0.695013,-0.634046,-0.168491,0.122818,0.220077,5.453331,-0.025477,0.000000,0.000000,0.359210
14,309,DE,1.028987,1.629315,-1.129663,-0.391261,-1.823117,2.170761,2.122272,1.831623,...,-0.945562,-0.667496,-1.566773,0.689483,1.095473,0.342798,0.460278,0.000000,-0.391261,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,78,DE,0.810509,-0.235078,-0.962203,0.971934,-0.121857,0.187811,0.219593,-0.605620,...,-0.456403,1.400533,1.512197,0.793561,0.848558,0.517578,-0.015261,0.000000,0.971934,1.619323
1488,87,DE,-0.295522,-1.051247,1.651718,-1.969871,0.074044,1.172127,0.076716,-0.700021,...,-0.564584,1.120555,0.865314,-0.423802,0.067325,1.558886,0.110206,0.000000,0.000000,0.000000
1489,809,DE,1.529204,1.106682,1.855327,-0.218658,1.450426,1.810665,1.388269,0.359723,...,-0.050781,-0.035360,-0.032517,0.876984,0.819520,1.320373,-0.172597,1.529204,-0.218658,0.000000
1491,1083,DE,0.856399,0.489199,0.255778,-1.531544,-0.829568,2.108764,1.866399,1.072553,...,0.894011,0.256338,0.402316,-1.112899,-0.237835,0.067152,0.151797,0.000000,0.000000,0.000000


## Remove Columns that have a low correlation with the TARGET variable

In [55]:
# Columns collected by hand for now, because the analysis and the feature
# engineering live in separate files.
# TODO: take them from variables once the two files are merged.

columns_keep_fr = ["DE_NET_EXPORT",
                "DE_HYDRO",
                "DE_WINDPOW",
                "FR_WINDPOW",
                "GAS_RET",
                "CARBON_RET",
                "TARGET"]

columns_keep_de = ["DE_NET_EXPORT",
                   "DE_GAS",
                   "DE_COAL",
                   "DE_HYDRO",
                   "DE_WINDPOW",
                   "FR_WINDPOW",
                   "DE_LIGNITE",
                   "DE_RESIDUAL_LOAD",
                   "DE_WIND",
                   "TARGET"]

# Drop every column that is neither in the keep list nor a threshold column.
# The columns to remove are computed first and dropped in one non-mutating
# call, instead of dropping in place while looping over the frame's own columns.

# French
columns_drop_fr = [
    name for name in df_fr.columns
    if name not in columns_keep_fr and "_THRESHOLD_" not in name
]
df_fr = df_fr.drop(columns=columns_drop_fr)

# German
columns_drop_de = [
    name for name in df_de.columns
    if name not in columns_keep_de and "_THRESHOLD_" not in name
]
df_de = df_de.drop(columns=columns_drop_de)

In [56]:
# Display the French frame after the column selection.
df_fr

Unnamed: 0,DE_NET_EXPORT,DE_HYDRO,DE_WINDPOW,FR_WINDPOW,GAS_RET,CARBON_RET,TARGET,COAL_RET_THRESHOLD_0.8,FR_CONSUMPTION_THRESHOLD_1.5,FR_NUCLEAR_THRESHOLD_-1.8,FR_HYDRO_THRESHOLD_-0.4
0,-0.244606,2.209047,-0.573370,-0.269460,0.339041,-0.002445,0.028313,0.124552,0.000000,0.000000,0.000000
1,-0.573520,0.187964,-0.035514,-0.107350,-0.659091,-0.490365,-0.112516,0.047114,0.000000,-2.185961,-0.807112
2,-0.622021,-0.108578,-0.298755,-0.141239,0.535974,0.204952,-0.180840,0.743338,1.978665,0.000000,0.000000
4,-0.244606,-0.230179,-0.774941,-0.564498,0.245109,2.614378,-0.071733,0.000000,0.000000,0.000000,-0.795983
5,-1.117139,2.306980,-0.977976,-0.245628,0.891049,1.124457,0.932105,0.000000,0.000000,-1.920695,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
1483,-0.977214,1.781299,-0.578489,0.149270,1.946355,1.322433,0.108277,0.000000,0.000000,0.000000,0.000000
1486,1.403843,-0.385397,2.285474,1.646472,0.494188,1.472650,0.827636,0.000000,0.000000,0.000000,0.000000
1487,0.101161,-0.315249,-0.446788,0.536297,1.145686,0.606318,0.049618,0.335645,0.000000,0.000000,0.000000
1490,0.449153,-0.341147,0.198857,0.789618,0.932633,0.356356,-0.063546,-0.085690,1.752840,0.000000,0.000000


In [57]:
# Display the German frame after the column selection.
df_de

Unnamed: 0,DE_NET_EXPORT,DE_GAS,DE_COAL,DE_HYDRO,DE_WINDPOW,FR_WINDPOW,DE_LIGNITE,DE_RESIDUAL_LOAD,DE_WIND,TARGET,DE_CONSUMPTION_THRESHOLD_1.2,DE_NET_EXPORT_THRESHOLD_-0.45,DE_WINDPOW_THRESHOLD_0.3
3,-0.270870,0.487818,-1.473817,-0.368417,-0.010090,0.366885,-2.330557,-1.191889,-0.499409,-0.260356,0.000000,-0.270870,0.000000
11,-0.851082,0.882313,-0.042992,1.282374,-0.103994,0.356181,-0.354480,-0.178397,1.143607,-0.133381,0.000000,0.000000,0.000000
12,-0.173123,-0.188430,-0.354327,-0.168264,1.448078,0.079753,-0.237658,-0.795593,1.383171,0.196312,0.000000,-0.173123,1.448078
13,-1.046122,0.544008,-1.183566,0.523610,0.359210,-0.304661,-2.244028,-0.676137,-0.138918,-0.025477,0.000000,0.000000,0.359210
14,-0.391261,2.170761,1.831623,1.223032,-1.210165,0.326872,0.892261,2.156285,-1.106067,0.460278,0.000000,-0.391261,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1485,0.971934,0.187811,-0.605620,0.128621,1.619323,0.386094,0.103651,-0.696438,-0.459455,-0.015261,0.000000,0.971934,1.619323
1488,-1.969871,1.172127,-0.700021,2.053830,-1.010181,-0.657622,-0.445943,0.604003,-0.721643,0.110206,0.000000,0.000000,0.000000
1489,-0.218658,1.810665,0.359723,-0.470809,-0.057214,2.968535,0.790548,1.547782,-0.098259,-0.172597,1.529204,-0.218658,0.000000
1491,-1.531544,2.108764,1.072553,0.037892,-0.981718,0.303324,0.211422,1.493870,-0.594595,0.151797,0.000000,0.000000,0.000000


**QUESTIONS**

FAIRE UNE PCA POUR LES VARIABLES NON SEUIL ? J'ai testé : faire une PCA n'améliore pas vraiment les performances de notre modèle.

## Modèle de base

In [58]:
# Baseline: linear regression on every column of df except the target and the
# country label; remaining NaNs in the features are replaced by 0.
y_all = df["TARGET"]
X_all = df.drop(columns=["TARGET", "COUNTRY"]).fillna(0)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so it can be chained.
lr = LinearRegression().fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

def spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between ``y_true`` and ``y_pred``."""
    result = spearmanr(y_true, y_pred)
    return result.correlation

# Report the baseline train and test Spearman correlations as percentages.
score_train = 100 * spearman_corr(y_train, y_pred_train)
score_test = 100 * spearman_corr(y_test, y_pred_test)

print("Corrélation (Spearman) train : {:.1f}%".format(score_train))
print("Corrélation (Spearman) test  : {:.1f}%".format(score_test))

Corrélation (Spearman) train : 28.9%
Corrélation (Spearman) test  : 19.5%


#### Modèle sur df_fr (en utilisant le df contenant les colonnes sélectionnées par corrélation plus les colonnes seuils)

Régression linéaire pour le moment


In [59]:
# Linear regression on the French frame (selected columns + threshold columns).
X_all_fr = df_fr.drop(columns=["TARGET"])
y_all_fr = df_fr["TARGET"]

# 80/20 train/test split with a fixed seed.
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(
    X_all_fr, y_all_fr, test_size=0.2, random_state=42
)

lr = LinearRegression()
lr.fit(X_train_fr, y_train_fr)

y_pred_train_fr = lr.predict(X_train_fr)
y_pred_test_fr = lr.predict(X_test_fr)

# spearman_corr is already defined in the baseline-model cell; the identical
# duplicate definition that used to sit here has been removed.
print("Corrélation (Spearman) train : {:.1f}%".format(100 * spearman_corr(y_train_fr, y_pred_train_fr)))
print("Corrélation (Spearman) test  : {:.1f}%".format(100 * spearman_corr(y_test_fr,  y_pred_test_fr)))

Corrélation (Spearman) train : 21.0%
Corrélation (Spearman) test  : 14.9%


#

#### Modèle sur df_de (en utilisant le df contenant les colonnes sélectionnées par corrélation plus les colonnes seuils)

Régression linéaire aussi

In [60]:
# Linear regression on the German frame (selected columns + threshold columns).
y_all_de = df_de["TARGET"]
X_all_de = df_de.drop(columns=["TARGET"])

# 80/20 train/test split with a fixed seed.
X_train_de, X_test_de, y_train_de, y_test_de = train_test_split(
    X_all_de, y_all_de, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so it can be chained.
lr = LinearRegression().fit(X_train_de, y_train_de)

y_pred_train_de = lr.predict(X_train_de)
y_pred_test_de = lr.predict(X_test_de)

score_train_de = 100 * spearman_corr(y_train_de, y_pred_train_de)
score_test_de = 100 * spearman_corr(y_test_de, y_pred_test_de)
print("Corrélation (Spearman) train : {:.1f}%".format(score_train_de))
print("Corrélation (Spearman) test  : {:.1f}%".format(score_test_de))

Corrélation (Spearman) train : 38.0%
Corrélation (Spearman) test  : 38.4%


In [61]:
# Pool the French and German test sets (true targets and linear-model
# predictions, in the same FR-then-DE order) and score them together.
y_test_global = np.concatenate((y_test_fr, y_test_de))
y_pred_global = np.concatenate((y_pred_test_fr, y_pred_test_de))

corr_global = spearman_corr(y_test_global, y_pred_global)
score_global = 100 * corr_global

print("Corrélation (Spearman) globale : {:.1f}%".format(score_global))

Corrélation (Spearman) globale : 26.0%


#### Assemblage des deux modèles et train/test comme le modèle de base pour comparer 
Il faudra penser, si on a le temps, à faire du k-fold pour éviter l'overfitting : on divise le jeu de données en 5 morceaux et, à chaque fois, on change les morceaux qui servent à l'entraînement et au test, puis on regarde si le modèle généralise bien.

#### Autres modèles à faire ensuite

### Polynomial Regression (vue lab2)

In [62]:
# Degree-2 polynomial regression on the French frame.
y_all_fr = df_fr["TARGET"]
X_all_fr = df_fr.drop(columns=["TARGET"])

# 80/20 train/test split with a fixed seed.
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(
    X_all_fr, y_all_fr, test_size=0.2, random_state=42
)

# The polynomial expansion is fitted on the training part only and then
# applied unchanged to the test part.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_fr)
X_test_poly = poly.transform(X_test_fr)

# fit() returns the estimator itself, so it can be chained.
lr_poly = LinearRegression().fit(X_train_poly, y_train_fr)

y_pred_train_poly_fr = lr_poly.predict(X_train_poly)
y_pred_test_poly_fr = lr_poly.predict(X_test_poly)

# Spearman scores
score_train_poly_fr = 100 * spearman_corr(y_train_fr, y_pred_train_poly_fr)
score_test_poly_fr = 100 * spearman_corr(y_test_fr, y_pred_test_poly_fr)
print("Corrélation (Spearman) train (poly) : {:.1f}%".format(score_train_poly_fr))
print("Corrélation (Spearman) test (poly)  : {:.1f}%".format(score_test_poly_fr))


Corrélation (Spearman) train (poly) : 24.9%
Corrélation (Spearman) test (poly)  : 17.3%


In [63]:
# Degree-2 polynomial regression on the German frame.
y_all_de = df_de["TARGET"]
X_all_de = df_de.drop(columns=["TARGET"])

# 80/20 train/test split with a fixed seed.
X_train_de, X_test_de, y_train_de, y_test_de = train_test_split(
    X_all_de, y_all_de, test_size=0.2, random_state=42
)

# The polynomial expansion is fitted on the training part only and then
# applied unchanged to the test part.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_de)
X_test_poly = poly.transform(X_test_de)

# fit() returns the estimator itself, so it can be chained.
lr_poly = LinearRegression().fit(X_train_poly, y_train_de)

y_pred_train_poly_de = lr_poly.predict(X_train_poly)
y_pred_test_poly_de = lr_poly.predict(X_test_poly)

# Spearman scores
score_train_poly_de = 100 * spearman_corr(y_train_de, y_pred_train_poly_de)
score_test_poly_de = 100 * spearman_corr(y_test_de, y_pred_test_poly_de)
print("Corrélation (Spearman) train (poly) : {:.1f}%".format(score_train_poly_de))
print("Corrélation (Spearman) test (poly) : {:.1f}%".format(score_test_poly_de))


Corrélation (Spearman) train (poly) : 49.1%
Corrélation (Spearman) test (poly) : 32.9%


In [64]:
# Pool the French and German test sets (true targets and polynomial-model
# predictions, in the same FR-then-DE order) and score them together.
y_test_global = np.concatenate((y_test_fr, y_test_de))
y_pred_global = np.concatenate((y_pred_test_poly_fr, y_pred_test_poly_de))

corr_global = spearman_corr(y_test_global, y_pred_global)
score_global = 100 * corr_global

print("Corrélation (Spearman) globale : {:.1f}%".format(score_global))

Corrélation (Spearman) globale : 25.5%


### Decision Tree Regressor (vue lab2)

In [65]:
# Decision tree regressor on the French frame, with fixed hyper-parameters.
X_all_fr = df_fr.drop(columns=["TARGET"])
y_all_fr = df_fr["TARGET"]

# Fix: the fourth target was previously unpacked into `y_testfr`, so the test
# score below only worked because `y_test_fr` was left over from an earlier
# cell. The split now binds `y_test_fr` itself.
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(
    X_all_fr, y_all_fr, test_size=0.2, random_state=42
)

tree = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=20,
    random_state=42
)
tree.fit(X_train_fr, y_train_fr)

y_pred_train_tree_fr = tree.predict(X_train_fr)
y_pred_test_tree_fr  = tree.predict(X_test_fr)


print("Corrélation (Spearman) train (tree) : {:.1f}%".format(
    100 * spearman_corr(y_train_fr, y_pred_train_tree_fr)
))
print("Corrélation (Spearman) test (tree)  : {:.1f}%".format(
    100 * spearman_corr(y_test_fr,  y_pred_test_tree_fr)
))


Corrélation (Spearman) train (tree) : 34.3%
Corrélation (Spearman) test (tree)  : 16.6%


In [66]:



# Grid search over decision-tree hyper-parameters on the French frame.
y_all_fr = df_fr["TARGET"]
X_all_fr = df_fr.drop(columns=["TARGET"])

# 80/20 train/test split with a fixed seed.
X_train_fr, X_test_fr, y_train_fr, y_test_fr = train_test_split(
    X_all_fr, y_all_fr, test_size=0.2, random_state=42
)

# 5 x 6 x 4 = 120 candidate parameter combinations.
param_grid = {
    "max_depth": [3, 5, 8, 10, None],
    "min_samples_leaf": [1, 5, 10, 20, 50, 70],
    "min_samples_split": [2, 5, 10, 20],
}

# NOTE(review): candidates are ranked by negative MSE under 3-fold CV, whereas
# the metric reported below is the Spearman correlation - confirm this is intended.
grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
    refit=True,
    verbose=1,
)
grid.fit(X_train_fr, y_train_fr)

print("Best parameters:", grid.best_params_)

# With refit=True the best candidate is refitted on the whole training part.
best_tree = grid.best_estimator_

y_pred_train_tree_fr = best_tree.predict(X_train_fr)
y_pred_test_tree_fr = best_tree.predict(X_test_fr)

score_train_best_fr = 100 * spearman_corr(y_train_fr, y_pred_train_tree_fr)
score_test_best_fr = 100 * spearman_corr(y_test_fr, y_pred_test_tree_fr)
print("Corrélation (Spearman) train (tree) optimisé : {:.1f}%".format(score_train_best_fr))
print("Corrélation (Spearman) test (tree) optimisé  : {:.1f}%".format(score_test_best_fr))


Fitting 3 folds for each of 120 candidates, totalling 360 fits
Best parameters: {'max_depth': 3, 'min_samples_leaf': 70, 'min_samples_split': 2}
Corrélation (Spearman) train (tree) optimisé : 21.9%
Corrélation (Spearman) test (tree) optimisé  : 14.5%


In [67]:
# Decision tree regressor on the German frame, with fixed hyper-parameters.
y_all_de = df_de["TARGET"]
X_all_de = df_de.drop(columns=["TARGET"])

# 80/20 train/test split with a fixed seed.
X_train_de, X_test_de, y_train_de, y_test_de = train_test_split(
    X_all_de, y_all_de, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so it can be chained.
tree = DecisionTreeRegressor(max_depth=8, min_samples_leaf=20, random_state=42).fit(
    X_train_de, y_train_de
)

y_pred_train_tree_de = tree.predict(X_train_de)
y_pred_test_tree_de = tree.predict(X_test_de)

score_train_tree_de = 100 * spearman_corr(y_train_de, y_pred_train_tree_de)
score_test_tree_de = 100 * spearman_corr(y_test_de, y_pred_test_tree_de)
print("Corrélation (Spearman) train (tree) : {:.1f}%".format(score_train_tree_de))
print("Corrélation (Spearman) test (tree)  : {:.1f}%".format(score_test_tree_de))

Corrélation (Spearman) train (tree) : 47.6%
Corrélation (Spearman) test (tree)  : 26.5%


Moins performant que les autres modèles

In [68]:
# Pool the French and German test sets (true targets and tree predictions, in
# the same FR-then-DE order) and score them together.
# NOTE(review): when the cells run in order, y_pred_test_tree_fr holds the
# grid-searched tree's predictions while y_pred_test_tree_de comes from the
# fixed-parameter tree - confirm this mix is intended.
y_test_global = np.concatenate((y_test_fr, y_test_de))
y_pred_global = np.concatenate((y_pred_test_tree_fr, y_pred_test_tree_de))

corr_global = spearman_corr(y_test_global, y_pred_global)
score_global = 100 * corr_global

print("Corrélation (Spearman) globale : {:.1f}%".format(score_global))

Corrélation (Spearman) globale : 19.5%
