# <span style="color: orange;">Modélisation - Prédiction du Taux de Grippe</span>

---

### <span style="color: green;">Introduction</span>

**Objectif** : Prédire le taux de grippe pour 100 000 habitants par région française pour des semaines spécifiques.

**Structure des données** : Données de panel (régions × semaines) nécessitant un traitement spécifique pour le découpage train/validation.

**Approche** : Du plus simple au plus complexe
1. Modèles naïfs (baseline)
2. Régression linéaire adaptée aux panels
3. Modèles ARIMA
4. Modèles ML (Random Forest, XGBoost, LightGBM)

---

## <span style="color: green;">1. Import des Librairies</span>

In [None]:
# Librairies de base
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit

# Mod√®les ML
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# XGBoost & LightGBM
import xgboost as xgb
import lightgbm as lgb

# S√©ries temporelles
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller, acf, pacf

# Configuration
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
plt.style.use('seaborn-v0_8-whitegrid')
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Librairies import√©es avec succ√®s !")

## <span style="color: green;">2. Chargement et Préparation des Données</span>

In [None]:
# Chargement du dataset
df = pd.read_csv('train_final.csv')

print(f"üìä Dimensions initiales : {df.shape[0]} lignes √ó {df.shape[1]} colonnes")
print(f"üìÖ P√©riode : {df['week'].min()} - {df['week'].max()}")
print(f"üó∫Ô∏è  R√©gions : {df['region_name'].nunique()}")
df.head()

In [None]:
# Conversion des dates
df['week_date'] = pd.to_datetime(df['week_date'])
df['month_date'] = pd.to_datetime(df['month_date'])

# Extraction de l'ann√©e et du num√©ro de semaine
df['year'] = df['year'].astype(int)
df['week_num'] = df['week'].astype(str).str[-2:].astype(int)

# Tri par r√©gion et par date (CRUCIAL pour les donn√©es de panel)
df = df.sort_values(['region_name', 'week_date']).reset_index(drop=True)

print("‚úÖ Donn√©es tri√©es par r√©gion et par date")
df[['region_name', 'week', 'week_date', 'year', 'week_num', 'TauxGrippe']].head(10)

## <span style="color: green;">3. Feature Engineering</span>

### 3.1 Création des variables temporelles

In [None]:
# Calendar features derived from the week date
week_dates = df['week_date'].dt
df['month_num'] = week_dates.month
df['quarter'] = week_dates.quarter
df['day_of_year'] = week_dates.dayofyear

# Flu-season flag: 1 for October through March, 0 for the rest of the year
flu_season_months = [10, 11, 12, 1, 2, 3]
df['saison_grippe'] = df['month_num'].isin(flu_season_months).astype('int64')

# Cyclic (sin/cos) encodings so that the end of a cycle sits next to its start
for unit, period in (('week', 52), ('month', 12)):
    angle = 2 * np.pi * df[f'{unit}_num'] / period
    df[f'sin_{unit}'] = np.sin(angle)
    df[f'cos_{unit}'] = np.cos(angle)

print("‚úÖ Variables temporelles cr√©√©es")
df[['week_date', 'month_num', 'quarter', 'saison_grippe', 'sin_week', 'cos_week']].head()

### 3.2 Création des Lags (Variables Retardées)

In [None]:
# Cr√©ation des lags PAR R√âGION (tr√®s important pour les donn√©es de panel)
def create_lags(data, column, lags, group_col='region_name'):
    """
    Return a copy of ``data`` with lagged versions of ``column`` appended.

    Each lag is computed with a per-group ``shift`` so that values never leak
    from one ``group_col`` group into another. Rows are shifted in their
    current order, so the caller is expected to have sorted the frame
    chronologically within each group beforehand.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to lag.
    lags : iterable of int
        Shift sizes; one ``{column}_lag{n}`` column is added per entry.
    group_col : str, default 'region_name'
        Column identifying the groups inside which the shift is applied.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra lag columns.
    """
    lagged = data.copy()
    per_group = lagged.groupby(group_col)[column]
    shifted = {f'{column}_lag{step}': per_group.shift(step) for step in lags}
    for new_name, values in shifted.items():
        lagged[new_name] = values
    return lagged

# Lags pour TauxGrippe (t-1, t-2, t-3, t-4 semaines)
lags_to_create = [1, 2, 3, 4]
df = create_lags(df, 'TauxGrippe', lags_to_create)

# Lags pour les requ√™tes Google
df = create_lags(df, 'requete_grippe', [1, 2])

print("‚úÖ Variables de lag cr√©√©es")
lag_cols = [col for col in df.columns if 'lag' in col]
print(f"   Colonnes de lag : {lag_cols}")

In [None]:
# Moyennes mobiles par r√©gion
def create_rolling_features(data, column, windows, group_col='region_name'):
    """
    Return a copy of ``data`` with rolling mean and std of ``column`` appended.

    For every window size two columns are added, ``{column}_rolling_mean_{w}``
    and ``{column}_rolling_std_{w}``. The statistics are computed inside each
    ``group_col`` group on the series shifted by one row, so the value at a
    given row only uses earlier rows. ``min_periods=1`` is used, hence the
    mean is available as soon as one past value exists (the std needs two).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column to aggregate.
    windows : iterable of int
        Rolling window sizes, in rows.
    group_col : str, default 'region_name'
        Column identifying the groups.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra rolling columns.
    """
    enriched = data.copy()

    def _past_window(series, size):
        # Shift first so the window ending at row t stops at row t-1
        return series.shift(1).rolling(window=size, min_periods=1)

    for size in windows:
        for stat in ('mean', 'std'):
            enriched[f'{column}_rolling_{stat}_{size}'] = enriched.groupby(group_col)[column].transform(
                lambda s, size=size, stat=stat: getattr(_past_window(s, size), stat)()
            )
    return enriched

# Moyennes mobiles sur 4 et 8 semaines
df = create_rolling_features(df, 'TauxGrippe', [4, 8])

print("‚úÖ Moyennes mobiles cr√©√©es")
rolling_cols = [col for col in df.columns if 'rolling' in col]
print(f"   Colonnes rolling : {rolling_cols}")

### 3.3 Encodage des Régions (Dummies)

In [None]:
# Cr√©ation des dummies pour les r√©gions
region_dummies = pd.get_dummies(df['region_name'], prefix='region', drop_first=True)
df = pd.concat([df, region_dummies], axis=1)

print(f"‚úÖ Dummies r√©gions cr√©√©es : {region_dummies.shape[1]} colonnes")
print(f"   R√©gions encod√©es : {list(region_dummies.columns)[:5]}...")

In [None]:
# Label Encoding pour les mod√®les qui ne supportent pas les dummies
le_region = LabelEncoder()
df['region_encoded'] = le_region.fit_transform(df['region_name'])

# Mapping pour r√©f√©rence
region_mapping = dict(zip(le_region.classes_, le_region.transform(le_region.classes_)))
print("‚úÖ Label Encoding des r√©gions")
print(f"   Exemple : ALSACE ‚Üí {region_mapping.get('ALSACE', 'N/A')}")

### 3.4 Ratios démographiques

In [None]:
# Cr√©ation de ratios pour √©viter la multicollin√©arit√©
df['ratio_jeunes'] = df['pop_0_19'] / df['pop_total']
df['ratio_seniors'] = df['pop_75_plus'] / df['pop_total']
df['ratio_actifs'] = (df['pop_20_39'] + df['pop_40_59']) / df['pop_total']

print("‚úÖ Ratios d√©mographiques cr√©√©s")
df[['region_name', 'pop_total', 'ratio_jeunes', 'ratio_seniors', 'ratio_actifs']].head()

In [None]:
# V√©rification des valeurs manquantes apr√®s feature engineering
missing_after_fe = df.isnull().sum()
missing_cols = missing_after_fe[missing_after_fe > 0]

print("üìä Valeurs manquantes apr√®s Feature Engineering :")
if len(missing_cols) > 0:
    for col, val in missing_cols.items():
        print(f"   {col}: {val} ({val/len(df)*100:.1f}%)")
else:
    print("   Aucune valeur manquante")

In [None]:
# Aper√ßu du dataset final
print(f"\nüìä Dimensions apr√®s Feature Engineering : {df.shape[0]} lignes √ó {df.shape[1]} colonnes")
print(f"\nüìã Liste des colonnes :")
for i, col in enumerate(df.columns, 1):
    print(f"   {i:2d}. {col}")

## <span style="color: green;">4. Découpage Train / Validation (Temporel)</span>

**Important** : Pour les données de panel temporelles, on ne fait PAS de split aléatoire. On garde l'ordre chronologique.

In [None]:
# D√©coupage temporel : 80% train, 20% validation
# On prend les derni√®res semaines pour la validation

# Identifier les semaines uniques tri√©es
unique_weeks = df['week'].sort_values().unique()
n_weeks = len(unique_weeks)

# Point de coupure (80% des semaines pour l'entra√Ænement)
split_idx = int(n_weeks * 0.8)
train_weeks = unique_weeks[:split_idx]
val_weeks = unique_weeks[split_idx:]

print(f"üìä D√âCOUPAGE TEMPOREL")
print(f"   Semaines totales : {n_weeks}")
print(f"   Semaines train   : {len(train_weeks)} ({train_weeks[0]} ‚Üí {train_weeks[-1]})")
print(f"   Semaines valid.  : {len(val_weeks)} ({val_weeks[0]} ‚Üí {val_weeks[-1]})")

In [None]:
# Cr√©ation des ensembles train et validation
df_train = df[df['week'].isin(train_weeks)].copy()
df_val = df[df['week'].isin(val_weeks)].copy()

print(f"\nüìä TAILLES DES ENSEMBLES")
print(f"   Train      : {len(df_train)} observations ({len(df_train)/len(df)*100:.1f}%)")
print(f"   Validation : {len(df_val)} observations ({len(df_val)/len(df)*100:.1f}%)")

In [None]:
# Suppression des lignes avec NaN (dues aux lags) pour la mod√©lisation
# On garde une copie avec NaN pour certains mod√®les
df_train_clean = df_train.dropna().copy()
df_val_clean = df_val.dropna().copy()

print(f"\nüìä APR√àS SUPPRESSION DES NaN (dus aux lags)")
print(f"   Train      : {len(df_train_clean)} observations")
print(f"   Validation : {len(df_val_clean)} observations")

## <span style="color: green;">5. Définition des Features et Target</span>

In [None]:
# Target variable
target = 'TauxGrippe'

# Base features (without the region dummies)
base_features = [
    # Lags
    'TauxGrippe_lag1', 'TauxGrippe_lag2', 'TauxGrippe_lag3', 'TauxGrippe_lag4',
    'requete_grippe_lag1', 'requete_grippe_lag2',
    # Rolling statistics
    'TauxGrippe_rolling_mean_4', 'TauxGrippe_rolling_mean_8',
    'TauxGrippe_rolling_std_4', 'TauxGrippe_rolling_std_8',
    # Google queries
    'requete_grippe', 'requete_grippe_aviaire_vaccin',
    # Calendar
    'week_num', 'month_num', 'quarter', 'saison_grippe',
    'sin_week', 'cos_week', 'sin_month', 'cos_month',
    # Demography (ratios)
    'ratio_jeunes', 'ratio_seniors', 'ratio_actifs',
    'pop_total'
]

# Features with region dummies.
# Take the names from the dummy frame itself: a `startswith('region_')` filter
# on df.columns would also pick up the raw string column 'region_name' (which
# makes every model's fit fail) and the ordinal 'region_encoded' column.
region_dummy_cols = list(region_dummies.columns)
features_with_dummies = base_features + region_dummy_cols

# Features with the label-encoded region
features_with_label = base_features + ['region_encoded']

print(f"üìä FEATURES D√âFINIES")
print(f"   Base features          : {len(base_features)}")
print(f"   Avec dummies r√©gions   : {len(features_with_dummies)}")
print(f"   Avec label encoding    : {len(features_with_label)}")

In [None]:
# Build the design matrices for the ML models.
# First make sure every requested feature is actually present in the data.
train_columns = set(df_train_clean.columns)
available_features = [name for name in features_with_dummies if name in train_columns]
missing_features = [name for name in features_with_dummies if name not in train_columns]

if missing_features:
    # Report what is absent and carry on with the features that do exist
    print(f"‚ö†Ô∏è Features manquantes : {missing_features}")
    features_with_dummies = available_features

X_train, y_train = df_train_clean[features_with_dummies], df_train_clean[target]
X_val, y_val = df_val_clean[features_with_dummies], df_val_clean[target]

print(f"\nüìä DIMENSIONS FINALES")
for label, part in (("X_train", X_train), ("y_train", y_train), ("X_val  ", X_val), ("y_val  ", y_val)):
    print(f"   {label} : {part.shape}")

## <span style="color: green;">6. Fonctions d'Évaluation</span>

In [None]:
def evaluate_model(y_true, y_pred, model_name="Model"):
    """
    Compute, print and return the evaluation metrics of a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    model_name : str, default "Model"
        Label printed with the metrics and stored in the returned dict.

    Returns
    -------
    dict
        Keys ``'model'``, ``'RMSE'``, ``'MAE'``, ``'R2'`` and ``'MAPE'``.
        MAPE is expressed in percent and computed only over observations whose
        true value is non-zero; it is NaN when every true value is zero.
    """
    # Work on plain float arrays so a pandas Series argument cannot trigger
    # index alignment in the element-wise arithmetic below.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Dividing by (y_true + epsilon) turns every zero target into a ~1e10 %
    # error that swamps the average, so zero targets are left out instead.
    nonzero = y_true != 0
    if nonzero.any():
        mape = np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
    else:
        mape = np.nan

    print(f"\nüìä {model_name}")
    print(f"   RMSE : {rmse:.2f}")
    print(f"   MAE  : {mae:.2f}")
    print(f"   R¬≤   : {r2:.4f}")
    print(f"   MAPE : {mape:.2f}%")

    return {'model': model_name, 'RMSE': rmse, 'MAE': mae, 'R2': r2, 'MAPE': mape}


def plot_predictions(y_true, y_pred, model_name="Model", n_points=200):
    """
    Show a two-panel diagnostic figure for a set of predictions.

    The left panel is a scatter of predictions against true values, limited to
    the first ``n_points`` observations, with the identity line drawn over the
    full range of ``y_true``. The right panel is a histogram of the errors
    (prediction minus truth) over all observations.

    Parameters
    ----------
    y_true : array-like
        Observed values; must support slicing, ``.min()`` and ``.max()``.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.
    model_name : str, default "Model"
        Used in both panel titles.
    n_points : int, default 200
        Number of leading observations drawn in the scatter plot.
    """
    fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: predictions against truth, plus the perfect-prediction diagonal
    lowest, highest = y_true.min(), y_true.max()
    ax_scatter.scatter(y_true[:n_points], y_pred[:n_points], alpha=0.5, color='steelblue')
    ax_scatter.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
    ax_scatter.set_xlabel('Valeurs R√©elles')
    ax_scatter.set_ylabel('Pr√©dictions')
    ax_scatter.set_title(f'{model_name} - Pr√©dictions vs R√©elles')

    # Right panel: error distribution with a marker at zero
    residuals = y_pred - y_true
    ax_hist.hist(residuals, bins=50, edgecolor='black', alpha=0.7, color='coral')
    ax_hist.axvline(0, color='red', linestyle='--', lw=2)
    ax_hist.set_xlabel('Erreur (Pr√©diction - R√©el)')
    ax_hist.set_ylabel('Fr√©quence')
    ax_hist.set_title(f'{model_name} - Distribution des Erreurs')

    plt.tight_layout()
    plt.show()

# Stockage des r√©sultats
results = []
print("‚úÖ Fonctions d'√©valuation d√©finies")

---
# <span style="color: blue;">PARTIE 1 : MODÈLES NAÏFS (BASELINE)</span>
---

## <span style="color: green;">7. Modèles Naïfs</span>

### 7.1 Modèle Naïf Simple (Persistance)

$$\hat{y}_{T+h|T} = y_T$$

La prédiction est égale à la dernière observation connue.

In [None]:
# Mod√®le Na√Øf Simple : pr√©diction = valeur de la semaine pr√©c√©dente (lag1)
y_pred_naive_simple = df_val_clean['TauxGrippe_lag1'].values

# √âvaluation
res_naive_simple = evaluate_model(y_val.values, y_pred_naive_simple, "Na√Øf Simple (Persistance)")
results.append(res_naive_simple)

plot_predictions(y_val.values, y_pred_naive_simple, "Na√Øf Simple")

### 7.2 Modèle Naïf Saisonnier

$$\hat{y}_{T+h|T} = y_{T+h-km}$$

où m = période saisonnière (52 semaines) et k = ⌊(h−1)/m⌋ + 1, c'est-à-dire le plus petit entier tel que la semaine T+h−km appartienne à l'historique observé.

In [None]:
# Mod√®le Na√Øf Saisonnier : pr√©diction = valeur de la m√™me semaine l'ann√©e pr√©c√©dente
# On cr√©e un lag de 52 semaines
df_temp = df.copy()
df_temp['TauxGrippe_lag52'] = df_temp.groupby('region_name')['TauxGrippe'].shift(52)

# Filtrer sur la p√©riode de validation
df_val_seasonal = df_temp[df_temp['week'].isin(val_weeks)].dropna(subset=['TauxGrippe_lag52'])

y_true_seasonal = df_val_seasonal['TauxGrippe'].values
y_pred_naive_seasonal = df_val_seasonal['TauxGrippe_lag52'].values

if len(y_pred_naive_seasonal) > 0:
    res_naive_seasonal = evaluate_model(y_true_seasonal, y_pred_naive_seasonal, "Na√Øf Saisonnier (lag 52)")
    results.append(res_naive_seasonal)
    plot_predictions(y_true_seasonal, y_pred_naive_seasonal, "Na√Øf Saisonnier")
else:
    print("‚ö†Ô∏è Pas assez de donn√©es pour le mod√®le na√Øf saisonnier (n√©cessite 1 an d'historique)")

### 7.3 Moyenne Mobile

$$\hat{y}_{T+1|T} = \frac{1}{k}\sum_{i=0}^{k-1} y_{T-i}$$

où k est la taille de la fenêtre (ici k = 4 semaines).

In [None]:
# Mod√®le Moyenne Mobile (sur 4 semaines)
y_pred_ma4 = df_val_clean['TauxGrippe_rolling_mean_4'].values

res_ma4 = evaluate_model(y_val.values, y_pred_ma4, "Moyenne Mobile (4 semaines)")
results.append(res_ma4)

plot_predictions(y_val.values, y_pred_ma4, "Moyenne Mobile 4")

### 7.4 Moyenne Historique par Région

In [None]:
# Moyenne historique par r√©gion (calcul√©e sur le train)
mean_by_region = df_train.groupby('region_name')['TauxGrippe'].mean()

# Appliquer aux donn√©es de validation
y_pred_mean_region = df_val_clean['region_name'].map(mean_by_region).values

res_mean_region = evaluate_model(y_val.values, y_pred_mean_region, "Moyenne Historique par R√©gion")
results.append(res_mean_region)

plot_predictions(y_val.values, y_pred_mean_region, "Moyenne par R√©gion")

### 7.5 Modèle Dérive (Drift)

$$\hat{y}_{T+h|T} = y_T + h \left( \frac{y_T - y_1}{T-1} \right)$$

In [None]:
# Mod√®le D√©rive par r√©gion
def compute_drift_prediction(group):
    """
    Compute the one-step-ahead drift forecast for every row of ``group``.

    The forecast is the most recent known value (``TauxGrippe_lag1``) plus the
    average per-step change observed between ``TauxGrippe_lag4`` and
    ``TauxGrippe_lag1``, i.e. over a 4-week window (3 steps).

    Parameters
    ----------
    group : pandas.DataFrame
        Frame holding the ``TauxGrippe_lag1`` and ``TauxGrippe_lag4`` columns.

    Returns
    -------
    numpy.ndarray
        One forecast per row, in row order.
    """
    window = 4   # number of weeks spanned by lag4 .. lag1
    horizon = 1  # forecast one week ahead
    latest = group['TauxGrippe_lag1'].values
    oldest = group['TauxGrippe_lag4'].values  # stands in for the first point of the window
    slope = (latest - oldest) / (window - 1)
    return latest + horizon * slope

# Application
# The drift forecast only combines the lag columns of each row, which were
# already built per region, so it can be computed on the whole validation
# frame at once. No groupby and no temporary column are needed, and the
# result stays aligned with y_val because both come from df_val_clean.
y_pred_drift = compute_drift_prediction(df_val_clean)

res_drift = evaluate_model(y_val.values, y_pred_drift, "Mod√®le D√©rive (Drift)")
results.append(res_drift)

---
# <span style="color: blue;">PARTIE 2 : RÉGRESSION LINÉAIRE</span>
---

## <span style="color: green;">8. Régression Linéaire</span>

### 8.1 Régression Linéaire Simple

In [None]:
# R√©gression Lin√©aire simple
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_val)

res_lr = evaluate_model(y_val.values, y_pred_lr, "R√©gression Lin√©aire")
results.append(res_lr)

plot_predictions(y_val.values, y_pred_lr, "R√©gression Lin√©aire")

In [None]:
# Coefficients les plus importants
coef_df = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lr.coef_
}).sort_values('coefficient', key=abs, ascending=False)

print("üìä Top 10 coefficients (R√©gression Lin√©aire) :")
coef_df.head(10)

### 8.2 Régression Ridge (L2)

In [None]:
# R√©gression Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_val)

res_ridge = evaluate_model(y_val.values, y_pred_ridge, "R√©gression Ridge")
results.append(res_ridge)

plot_predictions(y_val.values, y_pred_ridge, "Ridge")

### 8.3 Régression Lasso (L1)

In [None]:
# R√©gression Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

y_pred_lasso = lasso.predict(X_val)

res_lasso = evaluate_model(y_val.values, y_pred_lasso, "R√©gression Lasso")
results.append(res_lasso)

# Features s√©lectionn√©es par Lasso
lasso_coef = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': lasso.coef_
})
n_selected = (lasso_coef['coefficient'] != 0).sum()
print(f"\nüìä Lasso a s√©lectionn√© {n_selected}/{len(X_train.columns)} features")

---
# <span style="color: blue;">PARTIE 3 : MODÈLES ARIMA</span>
---

## <span style="color: green;">9. Modèles ARIMA</span>

ARIMA nécessite une série temporelle univariée. On va l'appliquer par région.

In [None]:
# Test de stationnarit√© (Augmented Dickey-Fuller)
def test_stationarity(series, region_name):
    """
    Run an Augmented Dickey-Fuller test on ``series`` and print the outcome.

    Missing values are dropped before the test. The series is reported as
    stationary when the p-value is below 0.05.

    Parameters
    ----------
    series : pandas.Series
        Time series to test.
    region_name : str
        Label used in the printed report.

    Returns
    -------
    bool
        True when the p-value is below 0.05.
    """
    adf_stat, p_value = adfuller(series.dropna())[:2]
    is_stationary = p_value < 0.05
    verdict = 'Oui' if is_stationary else 'Non'

    print(f"\nüìä Test ADF - {region_name}")
    print(f"   Statistique ADF : {adf_stat:.4f}")
    print(f"   p-value         : {p_value:.4f}")
    print(f"   Stationnaire    : {verdict}")
    return is_stationary

# Test sur une r√©gion exemple
region_test = 'ILE-DE-FRANCE'
series_test = df_train[df_train['region_name'] == region_test]['TauxGrippe']
test_stationarity(series_test, region_test)

In [None]:
# ARIMA fitted separately per region (only a few regions, as an example)
regions_to_model = df['region_name'].unique()[:5]  # first 5 regions

# Forecasts and matching observations, pooled over all modelled regions
arima_predictions = []
arima_actuals = []

for region in regions_to_model:
    print(f"\nüîÑ Mod√®le ARIMA pour {region}...")
    
    # Univariate target series of this region.
    # NOTE(review): these come from df_train / df_val, not from the *_clean
    # frames used by the other models, so the evaluated rows are not the same.
    train_region = df_train[df_train['region_name'] == region]['TauxGrippe'].values
    val_region = df_val[df_val['region_name'] == region]['TauxGrippe'].values
    
    try:
        # Fit ARIMA(1,1,1) - simple, fixed order (no per-region order selection)
        model = ARIMA(train_region, order=(1, 1, 1))
        fitted = model.fit()
        
        # Forecast the whole validation span in one go.
        # NOTE(review): this is a multi-step forecast (horizon = len(val_region)),
        # whereas the lag-based models are scored one step ahead; the resulting
        # scores are therefore not directly comparable.
        predictions = fitted.forecast(steps=len(val_region))
        
        arima_predictions.extend(predictions)
        arima_actuals.extend(val_region)
        
        rmse = np.sqrt(mean_squared_error(val_region, predictions))
        print(f"   RMSE : {rmse:.2f}")
        
    except Exception as e:
        # Best effort: a region whose fit or forecast fails is reported and skipped
        print(f"   ‚ö†Ô∏è Erreur : {e}")

# Overall ARIMA evaluation on the pooled forecasts (only if at least one region succeeded)
if len(arima_predictions) > 0:
    res_arima = evaluate_model(np.array(arima_actuals), np.array(arima_predictions), "ARIMA(1,1,1)")
    results.append(res_arima)

---
# <span style="color: blue;">PARTIE 4 : MODÈLES MACHINE LEARNING</span>
---

## <span style="color: green;">10. Random Forest</span>

In [None]:
# Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

res_rf = evaluate_model(y_val.values, y_pred_rf, "Random Forest")
results.append(res_rf)

plot_predictions(y_val.values, y_pred_rf, "Random Forest")

In [None]:
# Feature Importance - Random Forest
fi_rf = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Visualisation
plt.figure(figsize=(12, 8))
plt.barh(fi_rf['feature'][:15], fi_rf['importance'][:15], color='forestgreen')
plt.xlabel('Importance')
plt.title('Top 15 Features - Random Forest', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

fi_rf.head(15)

## <span style="color: green;">11. XGBoost</span>

In [None]:
# XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train, 
              eval_set=[(X_val, y_val)],
              verbose=False)

y_pred_xgb = xgb_model.predict(X_val)

res_xgb = evaluate_model(y_val.values, y_pred_xgb, "XGBoost")
results.append(res_xgb)

plot_predictions(y_val.values, y_pred_xgb, "XGBoost")

In [None]:
# Feature Importance - XGBoost
fi_xgb = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
plt.barh(fi_xgb['feature'][:15], fi_xgb['importance'][:15], color='darkorange')
plt.xlabel('Importance')
plt.title('Top 15 Features - XGBoost', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## <span style="color: green;">12. LightGBM</span>

In [None]:
# LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    # LightGBM only applies row subsampling (bagging) when the bagging
    # frequency is non-zero; with the default of 0, `subsample` is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# The validation set is passed for monitoring only: no early stopping is configured
lgb_model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)])

y_pred_lgb = lgb_model.predict(X_val)

res_lgb = evaluate_model(y_val.values, y_pred_lgb, "LightGBM")
results.append(res_lgb)

plot_predictions(y_val.values, y_pred_lgb, "LightGBM")

---
# <span style="color: blue;">PARTIE 5 : BENCHMARK & COMPARAISON</span>
---

## <span style="color: green;">13. Tableau Récapitulatif des Performances</span>

In [None]:
# Cr√©ation du DataFrame de benchmark
benchmark_df = pd.DataFrame(results)
benchmark_df = benchmark_df.sort_values('RMSE')

print("="*80)
print("                    üìä BENCHMARK DES MOD√àLES")
print("="*80)
benchmark_df

In [None]:
# Visualisation du benchmark
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# RMSE
colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(benchmark_df)))
axes[0].barh(benchmark_df['model'], benchmark_df['RMSE'], color=colors)
axes[0].set_xlabel('RMSE (plus bas = meilleur)')
axes[0].set_title('Comparaison RMSE', fontsize=12, fontweight='bold')
axes[0].invert_yaxis()

# MAE
axes[1].barh(benchmark_df['model'], benchmark_df['MAE'], color=colors)
axes[1].set_xlabel('MAE (plus bas = meilleur)')
axes[1].set_title('Comparaison MAE', fontsize=12, fontweight='bold')
axes[1].invert_yaxis()

# R¬≤
benchmark_sorted_r2 = benchmark_df.sort_values('R2', ascending=False)
colors_r2 = plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(benchmark_sorted_r2)))
axes[2].barh(benchmark_sorted_r2['model'], benchmark_sorted_r2['R2'], color=colors_r2)
axes[2].set_xlabel('R¬≤ (plus haut = meilleur)')
axes[2].set_title('Comparaison R¬≤', fontsize=12, fontweight='bold')
axes[2].invert_yaxis()

plt.tight_layout()
plt.savefig('benchmark_modeles.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Meilleur mod√®le
best_model = benchmark_df.iloc[0]

print("\n" + "="*60)
print("üèÜ MEILLEUR MOD√àLE")
print("="*60)
print(f"   Mod√®le : {best_model['model']}")
print(f"   RMSE   : {best_model['RMSE']:.2f}")
print(f"   MAE    : {best_model['MAE']:.2f}")
print(f"   R¬≤     : {best_model['R2']:.4f}")
print(f"   MAPE   : {best_model['MAPE']:.2f}%")

## <span style="color: green;">14. Génération des Submissions CSV</span>

In [None]:
# Fonction pour cr√©er un fichier submission
def create_submission(model, model_name, X_data, df_data, filename):
    """
    Predict with ``model`` and write the result as a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    model_name : str
        Human-readable model label (currently not used by the function).
    X_data : array-like
        Feature matrix passed to ``model.predict``.
    df_data : pandas.DataFrame
        Frame aligned row-by-row with ``X_data``; must contain the ``Id``,
        ``week`` and ``region_name`` columns.
    filename : str
        Path of the CSV file to write (written without the index).

    Returns
    -------
    pandas.DataFrame
        The submission table with columns ``Id``, ``week``, ``region_name``
        and ``TauxGrippe_pred``.
    """
    predicted = model.predict(X_data)

    # Identifier columns are copied as raw values so the source index is ignored
    payload = {name: df_data[name].values for name in ('Id', 'week', 'region_name')}
    payload['TauxGrippe_pred'] = predicted
    submission = pd.DataFrame(payload)

    submission.to_csv(filename, index=False)
    print(f"‚úÖ Submission sauvegard√©e : {filename}")
    return submission

# Cr√©er les submissions pour les meilleurs mod√®les
print("\nüìÑ G√âN√âRATION DES FICHIERS SUBMISSION")
print("-" * 50)

In [None]:
# Submission Random Forest
sub_rf = create_submission(rf, "Random Forest", X_val, df_val_clean, "submission_random_forest.csv")
sub_rf.head()

In [None]:
# Submission XGBoost
sub_xgb = create_submission(xgb_model, "XGBoost", X_val, df_val_clean, "submission_xgboost.csv")
sub_xgb.head()

In [None]:
# Submission LightGBM
sub_lgb = create_submission(lgb_model, "LightGBM", X_val, df_val_clean, "submission_lightgbm.csv")
sub_lgb.head()

## <span style="color: green;">15. Sauvegarde du Benchmark</span>

In [None]:
# Sauvegarde du tableau de benchmark
benchmark_df.to_csv('benchmark_modeles.csv', index=False)
print("‚úÖ Benchmark sauvegard√© : benchmark_modeles.csv")

# Affichage final
print("\n" + "="*80)
print("                    ‚úÖ MOD√âLISATION TERMIN√âE")
print("="*80)
print("\nüìÅ Fichiers g√©n√©r√©s :")
print("   ‚Ä¢ benchmark_modeles.csv")
print("   ‚Ä¢ benchmark_modeles.png")
print("   ‚Ä¢ submission_random_forest.csv")
print("   ‚Ä¢ submission_xgboost.csv")
print("   ‚Ä¢ submission_lightgbm.csv")

---
## <span style="color: green;">16. Résumé & Conclusions</span>

In [None]:
print("="*80)
print("                    üìä R√âSUM√â DE LA MOD√âLISATION")
print("="*80)

print("\nüìå DONN√âES")
print("-" * 40)
print(f"   Observations totales  : {len(df):,}")
print(f"   Train                 : {len(df_train_clean):,}")
print(f"   Validation            : {len(df_val_clean):,}")
print(f"   Features utilis√©es    : {len(features_with_dummies)}")

print("\nüìå MOD√àLES TEST√âS")
print("-" * 40)
print("   ‚Ä¢ Mod√®les Na√Øfs : Persistance, Saisonnier, Moyenne Mobile, Drift")
print("   ‚Ä¢ R√©gression : Lin√©aire, Ridge, Lasso")
print("   ‚Ä¢ S√©ries Temporelles : ARIMA")
print("   ‚Ä¢ Machine Learning : Random Forest, XGBoost, LightGBM")

print("\nüìå R√âSULTATS")
print("-" * 40)
print(f"   üèÜ Meilleur mod√®le : {best_model['model']}")
print(f"      RMSE : {best_model['RMSE']:.2f}")
print(f"      R¬≤   : {best_model['R2']:.4f}")

print("\nüìå OBSERVATIONS CL√âS")
print("-" * 40)
print("   ‚úì Les lags (valeurs pass√©es) sont les features les plus importantes")
print("   ‚úì Les requ√™tes Google apportent de l'information pr√©dictive")
print("   ‚úì La saisonnalit√© est bien capt√©e par les mod√®les ML")
print("   ‚úì Les mod√®les ensemblistes (RF, XGB, LGB) surpassent les baselines")

print("\n" + "="*80)