## Chargement des donn√©es


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


X_train = pd.read_csv('data\X_train_78VdSWL.csv')
X_test = pd.read_csv('data/X_test_XkVc4no.csv')
Y_train = pd.read_csv('data/y_train_u0UkkEh.csv')

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_train shape: {Y_train.shape}")


X_train shape: (1057, 21001)
X_test shape: (1057, 38141)
Y_train shape: (1057, 1001)


## Feature Engineering



In [3]:
start_date = '2023-01-09 00:00:00'

# Cr√©er les index temporels
time_index_train = pd.date_range(
    start=start_date,
    periods=len(X_train),
    freq="30min"
)
time_index_test = pd.date_range(
    start=time_index_train[-1] + pd.Timedelta(minutes=30),
    periods=len(X_test),
    freq="30min"
)

X_train["datetime"] = time_index_train
X_test["datetime"] = time_index_test

# Features temporelles
for df in [X_train, X_test]:
    df["hour"] = df["datetime"].dt.hour + df["datetime"].dt.minute / 60
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["dow"] = df["datetime"].dt.dayofweek
    df["is_weekend"] = (df["dow"] >= 5).astype(int)
    df["month"] = df["datetime"].dt.month

# Supprimer colonnes non n√©cessaires
cols_to_drop = ["Horodate", "datetime", "hour", "dow"]
X_train.drop(columns=cols_to_drop, inplace=True, errors="ignore")
X_test.drop(columns=cols_to_drop, inplace=True, errors="ignore")

## Identification des clients √† pr√©dire

In [4]:

clients_to_predict = set(Y_train.columns)
feature_columns = [c for c in X_train.columns if c not in clients_to_predict]

print(f"\nNombre de features: {len(feature_columns)}")
print(f"Nombre de clients √† pr√©dire: {len(clients_to_predict)}")



Nombre de features: 20004
Nombre de clients √† pr√©dire: 1001


## Nettoyage et normalisation

In [5]:

# Remplir les NaN
X_train_clean = X_train.reindex(columns=feature_columns, fill_value=0)
X_test_clean = X_test.reindex(columns=feature_columns, fill_value=0)

# Supprimer colonnes constantes
stds = X_train_clean.std(axis=0)
non_constant_cols = stds[stds > 0].index
X_train_clean = X_train_clean[non_constant_cols]
X_test_clean = X_test_clean[non_constant_cols]

print(f"Features apr√®s nettoyage: {X_train_clean.shape[1]}")

# V√©rifier s'il y a des NaN
print(f"NaN dans X_train: {X_train_clean.isnan().sum().sum()}")
print(f"NaN dans X_test: {X_test_clean.isnan().sum().sum()}")

# Remplir Y_train
Y_train = Y_train.fillna(0)

# Standardisation
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train_clean)
X_test_scaled = scaler_X.transform(X_test_clean)

scaler_Y = StandardScaler()
Y_train_scaled = scaler_Y.fit_transform(Y_train)

Features apr√®s nettoyage: 19301


AttributeError: 'DataFrame' object has no attribute 'isnan'

## PCA

In [None]:
# Trouver le nombre optimal de composantes
pca_test = PCA()
pca_test.fit(X_train_scaled)
cumsum_var = np.cumsum(pca_test.explained_variance_ratio_)

# Choisir n_components pour 95% de variance
n_components = np.argmax(cumsum_var >= 0.95) + 1
n_components = min(n_components, 200)  # Limite max

print(f"\nPCA: {n_components} composantes pour 95% de variance")

pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f"Variance expliqu√©e: {pca.explained_variance_ratio_.sum():.4f}")


## Split TRAIN/VALIDATION

In [None]:

X_tr, X_val, Y_tr, Y_val = train_test_split(
    X_train_pca, Y_train_scaled, 
    test_size=0.2, 
    random_state=42
)

print(f"\nTrain set: {X_tr.shape}")
print(f"Validation set: {X_val.shape}")



## Entrainement KNN optimis√©

In [None]:

# Test de plusieurs valeurs de k
k_values = [3, 5, 7, 10, 15, 20]
best_k = None
best_score = float('inf')

print("\n" + "="*50)
print("RECHERCHE DU MEILLEUR K")
print("="*50)

for k in k_values:
    knn = KNeighborsRegressor(
        n_neighbors=k,
        weights='distance',  # Pond√©ration par distance
        metric='euclidean',
        n_jobs=-1
    )
    
    knn.fit(X_tr, Y_tr)
    Y_val_pred = knn.predict(X_val)
    
    mae = mean_absolute_error(Y_val, Y_val_pred)
    print(f"k={k:2d} -> MAE validation: {mae:.4f}")
    
    if mae < best_score:
        best_score = mae
        best_k = k

print(f"\nMeilleur k: {best_k} avec MAE={best_score:.4f}")


## Entrainement final

In [None]:


print("\n" + "="*50)
print("ENTRA√éNEMENT FINAL")
print("="*50)

knn_final = KNeighborsRegressor(
    n_neighbors=best_k,
    weights='distance',
    metric='euclidean',
    n_jobs=-1
)

knn_final.fit(X_train_pca, Y_train_scaled)

# Pr√©dictions
Y_train_pred_scaled = knn_final.predict(X_train_pca)
Y_train_pred = scaler_Y.inverse_transform(Y_train_pred_scaled)

Y_test_scaled = knn_final.predict(X_test_pca)
Y_test = scaler_Y.inverse_transform(Y_test_scaled)



## Evaluation d√©taill√©e

In [None]:

print("\n" + "="*50)
print("√âVALUATION SUR L'ENSEMBLE D'ENTRA√éNEMENT")
print("="*50)

# M√©triques globales
mae_train = mean_absolute_error(Y_train.values, Y_train_pred)
rmse_train = np.sqrt(mean_squared_error(Y_train.values, Y_train_pred))
r2_train = r2_score(Y_train.values, Y_train_pred)

print(f"MAE (train):  {mae_train:.2f}")
print(f"RMSE (train): {rmse_train:.2f}")
print(f"R¬≤ (train):   {r2_train:.4f}")

# M√©triques par client
print("\n" + "="*50)
print("M√âTRIQUES PAR CLIENT (top 10 meilleurs/pires)")
print("="*50)

client_metrics = []
for i, client in enumerate(Y_train.columns):
    mae_client = mean_absolute_error(Y_train.iloc[:, i], Y_train_pred[:, i])
    rmse_client = np.sqrt(mean_squared_error(Y_train.iloc[:, i], Y_train_pred[:, i]))
    r2_client = r2_score(Y_train.iloc[:, i], Y_train_pred[:, i])
    client_metrics.append({
        'client': client,
        'MAE': mae_client,
        'RMSE': rmse_client,
        'R2': r2_client
    })

df_metrics = pd.DataFrame(client_metrics)
df_metrics = df_metrics.sort_values('MAE')

print("\nüèÜ Top 10 meilleurs clients (MAE le plus faible):")
print(df_metrics.head(10).to_string(index=False))

print("\n‚ö†Ô∏è Top 10 pires clients (MAE le plus √©lev√©):")
print(df_metrics.tail(10).to_string(index=False))


## Validation crois√©e


In [None]:
print("\n" + "="*50)
print("VALIDATION CROIS√âE (K-FOLD)")
print("="*50)

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Validation crois√©e sur le premier client comme exemple
client_0_data = Y_train.iloc[:, 0].values.reshape(-1, 1)
cv_scores = cross_val_score(
    knn_final, 
    X_train_pca, 
    client_0_data.ravel(),
    cv=kfold,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

print(f"CV MAE scores: {-cv_scores}")
print(f"CV MAE moyen: {-cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")


## Visualisation

In [None]:
print("\n" + "="*50)
print("G√âN√âRATION DES VISUALISATIONS")
print("="*50)

# Graphique 1: R√©el vs Pr√©dit pour un client
client_id = Y_train.columns[0]
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Train set
axes[0, 0].plot(Y_train[client_id].values, label='R√©el', linewidth=2, alpha=0.7)
axes[0, 0].plot(Y_train_pred[:, 0], label='Pr√©dit', linestyle='--', linewidth=2, alpha=0.7)
axes[0, 0].set_title(f'Client {client_id} ‚Äî R√©el vs Pr√©dit (TRAIN)', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Temps (pas de 30 min)')
axes[0, 0].set_ylabel('Consommation')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Scatter plot
axes[0, 1].scatter(Y_train[client_id].values, Y_train_pred[:, 0], alpha=0.5, s=10)
axes[0, 1].plot([Y_train[client_id].min(), Y_train[client_id].max()], 
                [Y_train[client_id].min(), Y_train[client_id].max()], 
                'r--', linewidth=2)
axes[0, 1].set_title('Scatter: R√©el vs Pr√©dit', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Valeurs r√©elles')
axes[0, 1].set_ylabel('Valeurs pr√©dites')
axes[0, 1].grid(True, alpha=0.3)

# R√©sidus
residuals = Y_train[client_id].values - Y_train_pred[:, 0]
axes[1, 0].plot(residuals, linewidth=1, alpha=0.7)
axes[1, 0].axhline(y=0, color='r', linestyle='--', linewidth=2)
axes[1, 0].set_title('R√©sidus (R√©el - Pr√©dit)', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Temps (pas de 30 min)')
axes[1, 0].set_ylabel('R√©sidu')
axes[1, 0].grid(True, alpha=0.3)

# Distribution des r√©sidus
axes[1, 1].hist(residuals, bins=50, edgecolor='black', alpha=0.7)
axes[1, 1].axvline(x=0, color='r', linestyle='--', linewidth=2)
axes[1, 1].set_title('Distribution des r√©sidus', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('R√©sidu')
axes[1, 1].set_ylabel('Fr√©quence')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('evaluation_complete.png', dpi=300, bbox_inches='tight')
plt.show()

# Graphique 2: Distribution des MAE par client
plt.figure(figsize=(12, 6))
plt.hist(df_metrics['MAE'], bins=30, edgecolor='black', alpha=0.7)
plt.axvline(x=df_metrics['MAE'].mean(), color='r', linestyle='--', 
            linewidth=2, label=f'Moyenne: {df_metrics["MAE"].mean():.2f}')
plt.title('Distribution des MAE par client', fontsize=14, fontweight='bold')
plt.xlabel('MAE')
plt.ylabel('Nombre de clients')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('distribution_mae.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Visualisations sauvegard√©es!")

## Diagnostic overfitting

In [None]:
print("\n" + "="*50)
print("DIAGNOSTIC D'OVERFITTING")
print("="*50)

if r2_train > 0.99:
    print("‚ö†Ô∏è ATTENTION: R¬≤ tr√®s proche de 1 (overfitting probable)")
    print("Recommandations:")
    print("  - R√©duire le nombre de composantes PCA")
    print("  - Augmenter k dans KNN")
    print("  - Utiliser une r√©gularisation")
elif r2_train > 0.95:
    print("‚ö†Ô∏è R¬≤ √©lev√© mais acceptable (surveiller)")
else:
    print("‚úÖ R¬≤ dans une plage raisonnable")

print(f"\n√âcart MAE train vs validation: {abs(mae_train - best_score):.2f}")
if abs(mae_train - best_score) / mae_train > 0.2:
    print("‚ö†Ô∏è √âcart important entre train et validation (overfitting)")
else:
    print("‚úÖ √âcart acceptable entre train et validation")

## Export des pr√©dictions de tests

In [None]:
Y_test_df = pd.DataFrame(Y_test, columns=Y_train.columns)
Y_test_df.to_csv('predictions_test.csv', index=False)
print(f"\n‚úÖ Pr√©dictions export√©es dans 'predictions_test.csv'")
print(f"Shape: {Y_test_df.shape}")