# 📊 M5 Forecasting - Exploration, Visualisation & Modélisation (RNN, LSTM, GRU)
### Objectifs :
1️⃣ Explorer les données afin de comprendre leur structure et leurs tendances générales.
2️⃣ Visualiser l’évolution des ventes.
3️⃣ Utiliser les données disponibles pour entraîner des modèles RNN, LSTM et GRU afin de prédire les ventes futures.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, GRU, SimpleRNN
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_absolute_error
from google.colab import drive
# Mount Google Drive under /content/drive (Colab-only helper) so files stored
# on Drive can be opened through ordinary file paths.
drive.mount('/content/drive')

In [None]:
# Load the three M5 CSV files from the mounted Drive folder.
# `path` must keep its trailing slash: file names are appended to it by plain
# string concatenation.
path = '/content/drive/MyDrive/M5_Forecasting/'
sales = pd.read_csv(path + 'sales_train_evaluation.csv')
calendar = pd.read_csv(path + 'calendar.csv')
prices = pd.read_csv(path + 'sell_prices.csv')
# The status message had been saved as mis-decoded UTF-8 (mojibake); this is
# the intended text.
print('✅ Données chargées avec succès !')
print('Sales shape:', sales.shape)

In [None]:
# Show the first rows of the two auxiliary tables, then the size of the sales table.
for heading, frame in (('Apercu du calendrier:', calendar), ('\nApercu des prix:', prices)):
    print(heading)
    print(frame.head())

n_rows, n_cols = sales.shape
print('\nNombre de produits uniques:', n_rows)
# The first 6 columns are not counted as sales days (same offset as the
# `iloc[..., 6:]` slices used when the series are extracted).
print('Nombre de jours de ventes:', n_cols - 6)

In [None]:
# Overview of the sales table: first rows, column info, descriptive statistics.
print('Apercu des donnees de ventes:')
print(sales.head())
print('\nInformations sur les colonnes:')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() appended a stray "None" line to the output.
sales.info()
print('\nStatistiques descriptives:')
print(sales.describe())

## Exploration des Données

Analysons la structure des données avant de continuer.

In [None]:
# Pick one series reproducibly (fixed seed) and plot its daily sales.
item_id = sales.sample(1, random_state=42)['id'].values[0]
# Values from column position 6 onward are used as the daily sales.
# A row taken across the id and sales columns may come back with object
# dtype, so cast explicitly to float to hand a numeric series to matplotlib
# and to the scaler that consumes `ts` afterwards.
ts = (
    sales[sales['id'] == item_id]
    .iloc[0, 6:]
    .reset_index(drop=True)
    .astype(float)
)
plt.figure(figsize=(12, 5))
plt.plot(ts)
# Title text restored from mojibake to the intended UTF-8 characters.
plt.title(f'Évolution des ventes pour {item_id}')
plt.xlabel('Jour')
plt.ylabel('Ventes')
plt.show()

In [None]:
# Sum all series day by day, plot the aggregated curve, then print summary figures.
total_sales = sales.iloc[:, 6:].sum(axis=0)

plt.figure(figsize=(14, 5))
plt.plot(total_sales.values, linewidth=1)
plt.title('Evolution des Ventes Totales (Tous Produits)')
plt.xlabel('Jour')
plt.ylabel('Ventes Totales')
plt.grid(True, alpha=0.3)
plt.show()

summary_lines = (
    f'Ventes totales moyennes par jour: {total_sales.mean():.2f}',
    f'Ventes totales max: {total_sales.max():.0f}',
    f'Ventes totales min: {total_sales.min():.0f}',
)
print('\n'.join(summary_lines))

In [None]:
# Draw three randomly chosen series, one per stacked subplot.
# The seeds 0, 10 and 20 make the selection reproducible.
fig, axes = plt.subplots(3, 1, figsize=(14, 10))
for seed, ax in zip((0, 10, 20), axes):
    item = sales.sample(1, random_state=seed)['id'].values[0]
    matches_item = sales['id'] == item
    ts_sample = sales.loc[matches_item].iloc[0, 6:].reset_index(drop=True)
    ax.plot(ts_sample, linewidth=0.8)
    ax.set_title(f'Serie temporelle: {item}')
    ax.set_xlabel('Jour')
    ax.set_ylabel('Ventes')
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Visualisation des Tendances

Explorons plusieurs séries temporelles pour comprendre les différents patterns.

In [None]:
# Keep the last 1000 observations of the selected series as a single-feature
# column vector.
data = ts[-1000:].values.reshape(-1, 1)
# Fit the scaler on the leading 80% of rows only. Fitting on the whole series
# let the min/max of the most recent rows (the part held out for testing by
# the chronological 80/20 split) leak into the preprocessing. Values in the
# held-out tail may therefore fall slightly outside [0, 1], which is expected.
scaler = MinMaxScaler()
n_fit_rows = int(0.8 * len(data))
scaler.fit(data[:n_fit_rows])
data_scaled = scaler.transform(data)
def create_sequences(data, seq_length):
    """Slice a series into overlapping input windows and next-step targets.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is the element
    that immediately follows it, ``data[i + seq_length]``.

    Args:
        data: Array-like indexed along its first axis by time step.
        seq_length: Number of consecutive steps per window; must be >= 1.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding ``len(data) - seq_length``
        samples. ``X`` has shape ``(n_samples, seq_length, *data.shape[1:])``
        and ``y`` has shape ``(n_samples, *data.shape[1:])``. When ``data`` is
        too short to produce a single window, both arrays are empty but keep
        those trailing dimensions.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f'seq_length must be >= 1, got {seq_length}')

    values = np.asarray(data)
    n_samples = len(values) - seq_length
    if n_samples <= 0:
        # Return correctly shaped empty arrays instead of bare shape-(0,)
        # arrays, so downstream shape-dependent code fails clearly or not at all.
        trailing = values.shape[1:]
        empty_X = np.empty((0, seq_length) + trailing, dtype=values.dtype)
        empty_y = np.empty((0,) + trailing, dtype=values.dtype)
        return empty_X, empty_y

    X = np.stack([values[start:start + seq_length] for start in range(n_samples)])
    # Targets are simply the series shifted by seq_length; copy so the result
    # does not alias the caller's array.
    y = values[seq_length:].copy()
    return X, y
# Number of past time steps fed to the network for each prediction.
SEQ_LEN = 30
X, y = create_sequences(data_scaled, SEQ_LEN)

# Chronological 80/20 split without shuffling: the earliest windows are used
# for training and the most recent ones for testing.
split = int(0.8 * len(X))
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

In [None]:
# Print the sizes and shapes produced by the preparation step, one per line.
shape_report = [
    f'Forme des donnees apres extraction: {data.shape}',
    f'Nombre de sequences creees: {len(X)}',
    f'Taille de lensemble dentrainement: {len(X_train)}',
    f'Taille de lensemble de test: {len(X_test)}',
    f'Forme de X_train: {X_train.shape}',
    f'Forme de y_train: {y_train.shape}',
]
print('\n'.join(shape_report))

## Préparation des Données pour les Modèles RNN

Normalisation et création des séquences temporelles.

In [None]:
def train_model(model_type='RNN'):
    """Build, train and evaluate a single-layer recurrent model.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``SEQ_LEN`` and ``scaler``. The test set is also passed to
    ``fit`` as validation data, so the validation curves in the returned
    history are computed on the same samples as the final MAE.

    Args:
        model_type: Recurrent layer to use: ``'RNN'``, ``'LSTM'`` or ``'GRU'``.

    Returns:
        Tuple ``(model, history, mae, y_pred_inv, y_test_inv)``: the trained
        model, the object returned by ``fit``, the test MAE computed after
        undoing the scaling, and the un-scaled predictions and targets.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
            Previously an unknown name silently trained a Dense-only model
            with no recurrent layer.
    """
    supported = ('RNN', 'LSTM', 'GRU')
    if model_type not in supported:
        raise ValueError(
            f'Unknown model_type {model_type!r}; expected one of {supported}'
        )
    recurrent_layer = {'RNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}[model_type]

    model = Sequential()
    model.add(recurrent_layer(50, activation='tanh', input_shape=(SEQ_LEN, 1)))
    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=30,
        batch_size=32,
        verbose=0,
    )

    # Undo the scaling so the MAE is expressed in the original sales units.
    y_pred = model.predict(X_test)
    y_pred_inv = scaler.inverse_transform(y_pred)
    y_test_inv = scaler.inverse_transform(y_test)
    mae = mean_absolute_error(y_test_inv, y_pred_inv)
    return model, history, mae, y_pred_inv, y_test_inv

## Construction et Entraînement des Modèles

Nous allons comparer trois architectures de réseaux de neurones récurrents.

In [None]:
# Train one model per architecture and keep the model, its training history
# and its test MAE, each keyed by architecture name.
models = {}
histories = {}
maes = {}
for model_type in ['RNN', 'LSTM', 'GRU']:
    # The leading newline had been turned into a physical line break inside a
    # single-quoted f-string, which is a SyntaxError; it is written as "\n"
    # again and the message text is restored from mojibake.
    print(f'\n🚀 Entraînement du modèle {model_type}...')
    model, history, mae, y_pred, y_true = train_model(model_type)
    models[model_type] = model
    histories[model_type] = history
    maes[model_type] = mae
    print(f'MAE ({model_type}): {mae:.3f}')

In [None]:
# Training (solid) and validation (dashed) MAE per epoch for every architecture.
plt.figure(figsize=(14, 6))
for model_type, model_history in histories.items():
    plt.plot(model_history.history['mae'], label=f'{model_type} - Train')
    plt.plot(model_history.history['val_mae'], linestyle='--', label=f'{model_type} - Val')
# Title and axis label restored from mojibake to the intended UTF-8 text.
plt.title('Comparaison des MAE (entraînement & validation)')
plt.xlabel('Époque')
plt.ylabel('MAE')
plt.legend()
plt.show()

In [None]:
# Final summary: best model by MAE and by R2, followed by a fixed list of
# improvement ideas.
# NOTE(review): `results_df` is not created above this cell in the visible
# file; it is built by the metrics cell further down, which presumably has to
# be run first - confirm the intended cell order.
rule_width = 70
print('RESUME DES RESULTATS')
print('=' * rule_width)
best_by_mae = results_df['MAE'].idxmin()
print(f'\nMeilleur modele selon MAE: {results_df.loc[best_by_mae, "Modele"]}')
print(f'MAE du meilleur modele: {results_df["MAE"].min():.4f}')
best_by_r2 = results_df['R2'].idxmax()
print(f'\nMeilleur modele selon R2: {results_df.loc[best_by_r2, "Modele"]}')
print(f'R2 du meilleur modele: {results_df["R2"].max():.4f}')

print('\n\nAMELIORATIONS POSSIBLES:')
print('-' * rule_width)
improvement_ideas = (
    '1. Augmenter le nombre de neurones (actuellement 50)',
    '2. Ajouter des couches recurrentes supplementaires (modele plus profond)',
    '3. Tester differentes longueurs de sequences (actuellement 30)',
    '4. Utiliser Bidirectional LSTM/GRU',
    '5. Ajouter du Dropout pour regularisation',
    '6. Augmenter le nombre depoques dentrainement',
    '7. Utiliser Early Stopping et Learning Rate Scheduling',
    '8. Incorporer des features externes (calendrier, prix, promotions)',
    '9. Essayer des architectures hybrides (CNN-LSTM)',
    '10. Utiliser des techniques densemble (moyenne des predictions)',
)
for idea in improvement_ideas:
    print(idea)

## Conclusion et Recommandations

Résumons les résultats et proposons des améliorations possibles.

In [None]:
# Print the Keras layer summary and the parameter count of each trained model.
banner = '=' * 60
for model_type in ('RNN', 'LSTM', 'GRU'):
    network = models[model_type]
    print(f'\n{banner}')
    print(f'Architecture du modele {model_type}:')
    print(banner)
    network.summary()
    print(f'\nNombre total de parametres: {network.count_params():,}')

## Architecture des Modèles

Affichons les détails de l'architecture de chaque modèle.

In [None]:
# Training (solid) and validation (dashed) MSE loss per epoch for every architecture.
plt.figure(figsize=(14, 6))
for model_type, run in histories.items():
    curves = run.history
    plt.plot(curves['loss'], label=f'{model_type} - Train Loss', linewidth=2)
    plt.plot(curves['val_loss'], linestyle='--', label=f'{model_type} - Val Loss', linewidth=2)
plt.title('Comparaison des Fonctions de Perte (MSE)')
plt.xlabel('Epoque')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Comparaison des Courbes d'Apprentissage

Analysons l'évolution de la perte pendant l'entraînement.

In [None]:
# One row of plots per model: residuals against predictions (left) and the
# residual histogram (right). Residuals are actual minus predicted, computed
# after undoing the scaling.
fig, axes = plt.subplots(3, 2, figsize=(14, 12))
# The un-scaled targets do not depend on the model, so compute them once.
y_test_inv = scaler.inverse_transform(y_test)
for (scatter_ax, hist_ax), model_type in zip(axes, ['RNN', 'LSTM', 'GRU']):
    model = models[model_type]
    y_pred = model.predict(X_test)
    y_pred_inv = scaler.inverse_transform(y_pred)
    residuals = y_test_inv - y_pred_inv

    scatter_ax.scatter(y_pred_inv, residuals, alpha=0.5, s=10)
    scatter_ax.axhline(y=0, color='r', linestyle='--', linewidth=2)
    scatter_ax.set_title(f'Residus vs Predictions ({model_type})')
    scatter_ax.set_xlabel('Predictions')
    scatter_ax.set_ylabel('Residus')
    scatter_ax.grid(True, alpha=0.3)

    hist_ax.hist(residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    hist_ax.axvline(x=0, color='r', linestyle='--', linewidth=2)
    hist_ax.set_title(f'Distribution des Residus ({model_type})')
    hist_ax.set_xlabel('Residus')
    hist_ax.set_ylabel('Frequence')
    hist_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Analyse des Résidus

Analysons les erreurs de prédiction pour comprendre les forces et faiblesses de chaque modèle.

In [None]:
# 2x2 grid of bar charts, one per metric, with the value written above each bar.
# NOTE(review): `results_df` is built by the metrics cell that appears after
# this one in the file; presumably that cell must run first - confirm.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
metrics = ['MAE', 'RMSE', 'R2', 'MAPE']
colors = ['#3498db', '#e74c3c', '#2ecc71']
# axes.flat walks the grid row by row, matching the metric order above.
for ax, metric in zip(axes.flat, metrics):
    values = results_df[metric].values
    bars = ax.bar(results_df['Modele'], values, color=colors, alpha=0.7)
    ax.set_title(f'Comparaison: {metric}')
    ax.set_ylabel(metric)
    ax.grid(True, alpha=0.3, axis='y')

    for bar, value in zip(bars, values):
        label_x = bar.get_x() + bar.get_width()/2.
        label_y = bar.get_height()
        ax.text(label_x, label_y, f'{value:.4f}',
                ha='center', va='bottom', fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# Build a comparison table (one row per model) of error metrics computed in
# the original sales units.

# The un-scaled targets do not depend on the model, so compute them once.
y_test_inv = scaler.inverse_transform(y_test)
# MAPE divides by the actual value. With zero-sales days among the targets the
# library substitutes a tiny epsilon and the score becomes astronomically
# large and meaningless, so MAPE is restricted to non-zero actuals.
# np.isclose guards against round-trip noise from the scaler.
nonzero_actuals = ~np.isclose(y_test_inv.ravel(), 0.0)

results = []
for model_type in ['RNN', 'LSTM', 'GRU']:
    model = models[model_type]
    y_pred = model.predict(X_test)
    y_pred_inv = scaler.inverse_transform(y_pred)

    mae = mean_absolute_error(y_test_inv, y_pred_inv)
    mse = mean_squared_error(y_test_inv, y_pred_inv)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test_inv, y_pred_inv)
    if nonzero_actuals.any():
        mape = mean_absolute_percentage_error(
            y_test_inv[nonzero_actuals], y_pred_inv[nonzero_actuals]
        )
    else:
        # No non-zero target at all: the percentage error is undefined.
        mape = np.nan

    results.append({
        'Modele': model_type,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2,
        'MAPE': mape
    })

results_df = pd.DataFrame(results)
print('\nTableau Comparatif des Performances:')
print(results_df.to_string(index=False))
print(f'\nMeilleur modele (MAE): {results_df.loc[results_df["MAE"].idxmin(), "Modele"]}')

## Métriques de Performance Détaillées

Calculons plusieurs métriques pour évaluer les modèles.

In [None]:
# Overlay actual and predicted sales for the first 100 test points, one
# subplot per model; each title carries the MAE stored in `maes`.
fig, axes = plt.subplots(3, 1, figsize=(14, 12))
# The un-scaled targets do not depend on the model, so compute them once.
y_test_inv = scaler.inverse_transform(y_test)
n_shown = 100
for ax, model_type in zip(axes, ['RNN', 'LSTM', 'GRU']):
    model = models[model_type]
    y_pred = model.predict(X_test)
    y_pred_inv = scaler.inverse_transform(y_pred)

    ax.plot(y_test_inv[:n_shown], label='Valeurs Reelles', linewidth=2, alpha=0.7)
    ax.plot(y_pred_inv[:n_shown], label='Predictions', linewidth=2, alpha=0.7)
    ax.set_title(f'Predictions du modele {model_type} (MAE: {maes[model_type]:.3f})')
    ax.set_xlabel('Temps')
    ax.set_ylabel('Ventes')
    ax.legend()
    ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()