# KNN Baseline – Data Challenge ENEDIS

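Baseline KNN (k plus proches voisins) précédée d'une standardisation et d'une PCA, pour prédire Y_train à partir de X_train ; le modèle est évalué par RMSE sur un split temporel 80/20.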

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


In [None]:

# Load the feature table (X) and the target table (Y).
x_train = pd.read_csv("data/X_train_78VdSWL.csv")
y_train = pd.read_csv("data/y_train_u0UkKEh.csv")

# Parse the "Horodate" column of both tables into datetimes.
x_train["Horodate"] = pd.to_datetime(x_train["Horodate"])
y_train["Horodate"] = pd.to_datetime(y_train["Horodate"])

# Check that X and Y carry the same timestamps, row for row.
# An explicit raise is used instead of `assert` so the check is not stripped
# when Python runs with -O. np.array_equal also returns False when the two
# tables have different lengths, where an elementwise `==` would not give a
# clean per-row comparison.
if not np.array_equal(x_train["Horodate"].values, y_train["Horodate"].values):
    raise ValueError(
        "Horodate mismatch between X_train and y_train "
        f"({len(x_train)} vs {len(y_train)} rows)"
    )
print("Horodate alignée")


In [None]:

# Temporal features derived from the "Horodate" timestamp column.
horodate = x_train["Horodate"]
x_train["hour"] = horodate.dt.hour
x_train["weekday"] = horodate.dt.weekday

# Cyclical encoding of the hour on a 24-hour circle, so that hour 23 and
# hour 0 end up close to each other in feature space.
hour_angle = 2*np.pi*x_train["hour"]/24
x_train["hour_sin"] = np.sin(hour_angle)
x_train["hour_cos"] = np.cos(hour_angle)

# The raw timestamp column is dropped from both tables now that the
# derived features exist.
x_train = x_train.drop("Horodate", axis=1)
y_train = y_train.drop("Horodate", axis=1)


In [None]:

# Missing-value handling.
# First record, per row, how many cells are missing (counted over every column
# currently in x_train), then replace the missing cells with 0. The count must
# be taken before the fill, otherwise it would always be 0.
missing_per_row = x_train.isna().sum(axis=1)
x_train["nb_missing"] = missing_per_row
x_train = x_train.fillna(0)

for label, frame in (("X", x_train), ("Y", y_train)):
    print(f"{label} shape : {frame.shape}")


In [None]:

# Positional 80/20 split: the first 80% of rows form the training split and
# the remaining rows form the validation split. No shuffling is applied, so
# row order is preserved (assumes rows are in chronological order — TODO confirm).
split_idx = int(0.8 * len(x_train))

X_tr, X_val = x_train.iloc[:split_idx], x_train.iloc[split_idx:]
Y_tr, Y_val = y_train.iloc[:split_idx], y_train.iloc[split_idx:]


In [None]:

# Standardisation: the scaler is fitted on the training split only, and that
# same fitted transform is then applied to both splits, so no validation
# statistics are used when fitting.
scaler = StandardScaler().fit(X_tr)
X_tr_scaled = scaler.transform(X_tr)
X_val_scaled = scaler.transform(X_val)


In [None]:

# PCA: project the standardised features onto 50 components. The projection
# is fitted on the training split and reused unchanged for validation.
pca = PCA(n_components=50, random_state=42)
X_tr_pca = pca.fit_transform(X_tr_scaled)
X_val_pca = pca.transform(X_val_scaled)

# Share of the training variance retained by the kept components, in percent.
explained_pct = pca.explained_variance_ratio_.sum() * 100
print(f"Variance expliquée : {explained_pct:.2f}%")


In [None]:

# KNN regressor configuration:
#   - 5 neighbours, each weighted by the inverse of its distance,
#   - Euclidean distance in the PCA space,
#   - n_jobs=-1 to use all available cores for the neighbour search.
knn_params = {
    "n_neighbors": 5,
    "weights": "distance",
    "metric": "euclidean",
    "n_jobs": -1,
}
knn = KNeighborsRegressor(**knn_params)

# Fit on the PCA-projected training features against the training targets.
knn.fit(X_tr_pca, Y_tr)


In [None]:

# Predict the targets for the validation split.
Y_pred = knn.predict(X_val_pca)

# Global RMSE: both arrays are flattened so the error is computed over every
# (row, column) cell at once rather than per target column.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# np.sqrt(MSE) gives the same value on every version.
rmse = np.sqrt(
    mean_squared_error(
        Y_val.values.flatten(),
        Y_pred.flatten(),
    )
)

print(f"RMSE globale KNN : {rmse:.4f}")


In [None]:

# Example plot: true vs. predicted values for a single target column.
client_id = 0   # positional index of the target column to display
n_points = 200  # number of leading validation rows to plot

fig, ax = plt.subplots(figsize=(10,4))
ax.plot(Y_val.iloc[:n_points, client_id].values, label="True")
ax.plot(Y_pred[:n_points, client_id], label="Pred")
ax.legend()
# NOTE(review): the title assumes target columns are named holed_1, holed_2, ...
# in positional order — confirm against the y_train header.
ax.set_title(f"KNN – Client holed_{client_id+1}")
plt.show()
