In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from scipy.stats import spearmanr


# Load the datasets: training features/targets plus the final test features/targets.
data_X_train = pd.read_csv("X_train.csv")
data_y_train = pd.read_csv("y_train.csv")
data_X_test = pd.read_csv("X_test_final.csv")
# NOTE(review): the file name suggests these may be random placeholder targets —
# confirm before interpreting any score computed against them.
data_y_test = pd.read_csv("y_test_random_final.csv")

# Features are every training column except the identifier and the country label.
X = data_X_train.drop(columns=["ID", "COUNTRY"]).copy()
# Targets are paired with the features by row position only (no join on ID here).
# Assumes y_train.csv rows are in the same order as X_train.csv — TODO confirm.
y = data_y_train["TARGET"].copy()

# Hold-out split: 80% to fit the model, 20% to evaluate it; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column lists by dtype. The preprocessing below does not use them: it selects
# columns through make_column_selector instead.
num_columns = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_columns = X_train.select_dtypes(include=["object"]).columns

# Numeric features: fill missing values with the column mean, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column to the matching sub-pipeline according to its dtype.
preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

# Full model: preprocessing followed by an ordinary least-squares regression.
model_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("lregression", LinearRegression()),
    ]
)

# Fit on the training part of the split and predict the held-out part.
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)

# Hold-out metrics: RMSE and Spearman rank correlation.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print("Valeur RMSE sans validation croisée :", rmse)

corr, p_value = spearmanr(y_test, y_pred)
print(f"Corrélation de Spearman : {corr:.4f}")

Valeur RMSE sans validation croisée : 1.0944930936351775
Corrélation de Spearman : 0.2007


In [36]:
# 5-fold cross-validation of the full pipeline on the training split.
# The scorer returns the negated RMSE, so the sign is flipped back below.
scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
)

rmse_scores = np.negative(scores)
print("RMSE par fold :", rmse_scores)
print("Moyenne RMSE après validation croisée :", rmse_scores.mean())

# Out-of-fold predictions: each training row is predicted by a model
# that was fitted on the other folds.
y_pred_cv = cross_val_predict(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    cv=5,
)

# Spearman rank correlation between the true targets and the pooled
# out-of-fold predictions (one value over all rows, not a per-fold average).
corr, p_value = spearmanr(y_train, y_pred_cv)

print(f"Corrélation de Spearman en validation croisée : {corr:.4f}")

RMSE par fold : [1.13464494 1.00512516 0.97709084 0.9592716  0.99570123]
Moyenne RMSE après validation croisée : 1.0143667556253755
Corrélation de Spearman en validation croisée : 0.2095


In [None]:
# Predict on the real test set with the pipeline fitted in the first cell.
# data_X_test is passed as loaded: unlike the training frame, ID and COUNTRY
# are not dropped here.
# NOTE(review): relies on the fitted column transformer tolerating these extra
# columns — confirm.
y_pred_test_final = model_pipeline.predict(data_X_test)

# Spearman rank correlation between the provided test targets and the predictions.
# Rows are paired by position only — assumes both files share the same row order;
# TODO confirm.
corr_final, p_value_final = spearmanr(
    data_y_test["TARGET"],
    y_pred_test_final,
)

print(
    f"Corrélation de Spearman sur le test final : {corr_final:.4f}",
    f"p-value : {p_value_final:.4f}",
    sep="\n",
)


Corrélation de Spearman sur le test final : -0.0135
p-value : 0.7304
