In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import numpy as np

# Load the datasets (paths are relative to the notebook's working directory).
data_X_train = pd.read_csv("X_train.csv")
data_y_train = pd.read_csv("y_train.csv")
# The two test files are loaded here but are not used by the code in this cell.
data_X_test = pd.read_csv("X_test_final.csv")
data_y_test = pd.read_csv("y_test_random_final.csv")

# Tunable settings, named once instead of being repeated as bare literals.
SEED = 42           # shared by the split and the forest so runs are reproducible
TEST_SIZE = 0.2     # fraction of the training rows held out for evaluation
N_ESTIMATORS = 100  # number of trees in the random forest

# Features and target come from two separate files and are paired purely by
# row position. When the target file also carries an ID column, check that
# both files list the same IDs in the same order, so a shuffled or truncated
# file fails loudly instead of silently training on mismatched rows.
if "ID" in data_y_train.columns and not np.array_equal(
    data_X_train["ID"].to_numpy(), data_y_train["ID"].to_numpy()
):
    raise ValueError(
        "X_train.csv and y_train.csv do not list the same IDs in the same order"
    )

# Data preparation: drop the identifier and the COUNTRY column from the
# features, and keep TARGET as the value to predict.
X = data_X_train.drop(columns=["ID", "COUNTRY"]).copy()
y = data_y_train["TARGET"].copy()

# NOTE: X_test / y_test below are a hold-out split of the *training* data;
# they are distinct from data_X_test / data_y_test loaded above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Numeric columns: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Object-dtype columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored instead of raising.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column to the matching sub-pipeline based on its dtype.
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

# Random forest regressor chained after the preprocessing step, so the
# imputers/scaler/encoder are fitted on the training split only.
model_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("randomforest", RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=SEED))
])

# Training
model_pipeline.fit(X_train, y_train)

# Prediction on the hold-out split
y_pred = model_pipeline.predict(X_test)

# Evaluation: root mean squared error and Spearman rank correlation between
# the held-out targets and the predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Valeur RMSE avec Random Forest :", rmse)

# p_value is kept available for inspection but is not printed.
corr, p_value = spearmanr(y_test, y_pred)
print(f"Corrélation de Spearman : {corr:.4f}")


Valeur RMSE avec Random Forest : 1.192702469134948
Corrélation de Spearman : 0.1577
