# PREPARATION

## Driving variables

In [22]:
SKIP_SECTION_COLAB = True
SKIP_SECTION_KAGGLE = not SKIP_SECTION_COLAB

## Load libraries

In [23]:
%pip install mplcursors

Note: you may need to restart the kernel to use updated packages.


In [24]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Bidirectional
from keras.callbacks import Callback
from IPython.display import clear_output

import datetime
from tensorflow.keras.callbacks import EarlyStopping
import gc
import json

from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Reshape
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Model
import mplcursors
from keras.layers import Input
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.saving import register_keras_serializable
from tensorflow.keras.models import Sequential
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
import sys
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf
from sklearn.model_selection import train_test_split
import random
import torch
import torch.nn as nn
import torch.optim as optim


## Check GPU

In [25]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [26]:
# Limit TensorFlow to only use the first GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set TensorFlow to only use the GPU and not fall back to CPU
        tf.config.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("GPU is set up and ready to use.")
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found, using CPU instead.")

GPU is set up and ready to use.


In [27]:
# Configure memory growth for GPUs
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print("Error:", e)

## Mount disk and define paths

In [28]:
# Check disk content (Kaggle) or mount disk (Google Colab)
if not SKIP_SECTION_KAGGLE:
  os.listdir('/kaggle/input')

if not SKIP_SECTION_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

In [None]:
# Data paths
if not SKIP_SECTION_KAGGLE:
  path_data = "/kaggle/input/dossier-donnees/dossier_donnees/jour2/data/"
  path_preprocessed_data = "/kaggle/input/dossier-donnees/dossier_donnees/jour2/data/preprocessed_data/"
  path_raw_data = "/kaggle/input/dossier-donnees/dossier_donnees/jour2/data/raw_data/"    
  path_preprocessed_data_window_7 = "/kaggle/input/preprocessed-data-7/"

if not SKIP_SECTION_COLAB:
  path_data = "/content/drive/MyDrive/DSTI/Hackathon/Time Series/Material_Preparation/Phase_2/dossier_donnees/jour2/data/"
  path_preprocessed_data = "/content/drive/MyDrive/DSTI/Hackathon/Time Series/Material_Preparation/Phase_2/dossier_donnees/jour2/data/preprocessed_data/"
  path_raw_data = "/content/drive/MyDrive/DSTI/Hackathon/Time Series/Material_Preparation/Phase_2/dossier_donnees/jour2/data/raw_data/"


In [30]:
# Define the log directory path
###log_dir = "kaggle/input/logs/fit/"

# Create the directory if it doesn't exist
###os.makedirs(log_dir, exist_ok=True)

# Print confirmation
###print(f"Directory {log_dir} created for TensorBoard logs.")

# DONNEES

## Configurer les générateurs de nombres aléatoires

In [31]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if using multi-GPU
    np.random.seed(seed)
    random.seed(seed)

set_seed(42)

## PREMIER PREPROCESSING - Raw data

## SECOND PREPROCESSING - Fenêtres glissantes de données

Ces deux preprocessing sont effectué par le script présent dans l'Application fournie et renvoie en sortie les 4 fichiers csv utilisés ci-dessous après leur enregistrement dans un dossier dédié (path_preprocessed_data) : x_train, x_train, x_valid, y_valid.

In [32]:
# Structure des modèles par défaut
taille_fenetre_a_predire_mapping = {
    "taille_fenetre_a_predire": {
        1: {
            "taille_fenetre_observee": 12,
            "taille_pas_glissant_train": 13,
            "taille_pas_glissant_valid": 13
        },
        5: {
            "taille_fenetre_observee": 60,
            "taille_pas_glissant_train": 13,
            "taille_pas_glissant_valid": 65
        },
        30: {
            "taille_fenetre_observee": 300,
            "taille_pas_glissant_train": 13,
            "taille_pas_glissant_valid": 65
        },
        60: {
            "taille_fenetre_observee": 400,
            "taille_pas_glissant_train": 13,
            "taille_pas_glissant_valid": 65
        },
        300: {
            "taille_fenetre_observee": 500,
            "taille_pas_glissant_train": 13,
            "taille_pas_glissant_valid": 65
        }
    }
}

taille_fenetre_a_predire = 60
taille_fenetre_observee, taille_pas_glissant_train, taille_pas_glissant_valid = \
taille_fenetre_a_predire_mapping["taille_fenetre_a_predire"][taille_fenetre_a_predire].values()

print("Dimensions choisies:")
print("taille_fenetre_a_predire:", taille_fenetre_a_predire)
print("taille_fenetre_observee:", taille_fenetre_observee)
print("taille_pas_glissant_train:", taille_pas_glissant_train)
print("taille_pas_glissant_valid:", taille_pas_glissant_valid)

Dimensions choisies:
taille_fenetre_a_predire: 60
taille_fenetre_observee: 400
taille_pas_glissant_train: 13
taille_pas_glissant_valid: 65


## *** Section pour choisir directement des fichiers x_train, y_train, x_valid, y_valid déjà préprocessés


In [33]:
# Fichiers x_train, y_train, x_valid, y_valid déjà préprocessés
print("Dimensions choisies:")
print("taille_fenetre_a_predire:", taille_fenetre_a_predire)
print("taille_fenetre_observee:", taille_fenetre_observee)
print("taille_pas_glissant_train:", taille_pas_glissant_train)
print("taille_pas_glissant_valid:", taille_pas_glissant_valid)

# Choix ces CSV déjà préprocessés, correspondant à l'horizon de prédiction choisi ci-dessus
# Version du préprocessing : Novembre 2024 (Phase 1)
#path_preprocessed_data = "/kaggle/input/dossier-donnees/dossier_donnees/jour2/data/preprocessed_data/"
#x_train = pd.read_csv(f"{path_preprocessed_data}x_train.csv", header=None)
#y_train = pd.read_csv(f"{path_preprocessed_data}y_train.csv", header=None)
#x_valid = pd.read_csv(f"{path_preprocessed_data}x_valid.csv", header=None)
#y_valid = pd.read_csv(f"{path_preprocessed_data}y_valid.csv", header=None)

# Version du préprocessing : Janvier 2025
# Exécuter le code de la section juste avant, qui est en dehors de cette section raccourci (## Préprocessing - Générer les données préprocessées)
# Données dupliquées regroupées et périodicités non constantes lissées linéairement

# Version du préprocessing : Mars 2025
path_preprocessed_data = "/kaggle/input/new-preprocessing/"
x_train = pd.read_csv(f"{path_preprocessed_data}x_train_s{taille_pas_glissant_train}_o{taille_fenetre_observee}_p{taille_fenetre_a_predire}.csv", header=None)
y_train = pd.read_csv(f"{path_preprocessed_data}y_train_s{taille_pas_glissant_train}_o{taille_fenetre_observee}_p{taille_fenetre_a_predire}.csv", header=None)
x_valid = pd.read_csv(f"{path_preprocessed_data}x_valid_s{taille_pas_glissant_valid}_o{taille_fenetre_observee}_p{taille_fenetre_a_predire}.csv", header=None)
y_valid = pd.read_csv(f"{path_preprocessed_data}y_valid_s{taille_pas_glissant_valid}_o{taille_fenetre_observee}_p{taille_fenetre_a_predire}.csv", header=None)

Dimensions choisies:
taille_fenetre_a_predire: 60
taille_fenetre_observee: 400
taille_pas_glissant_train: 13
taille_pas_glissant_valid: 65


## EDA - EXPLORATION ET ANALYSE DES DONNEES

In [34]:
# Get the number of rows and columns for each dataframe
x_train_shape = x_train.shape
x_valid_shape = x_valid.shape
#x_test_shape = x_test.shape
y_train_shape = y_train.shape
y_valid_shape = y_valid.shape

In [35]:
print(f"x_train - Number of rows: {x_train_shape[0]}, Number of columns: {x_train_shape[1]}")
print(f"x_valid - Number of rows: {x_valid_shape[0]}, Number of columns: {x_valid_shape[1]}")
#print(f"x_test - Number of rows: {x_test_shape[0]}, Number of columns: {x_test_shape[1]}")

x_train - Number of rows: 132019, Number of columns: 400
x_valid - Number of rows: 4240, Number of columns: 400


In [36]:
# Calculate min, max, mean, and std
min_x_train = x_train.values.min()
max_x_train = x_train.values.max()
mean_x_train = x_train.values.mean()
std_x_train = x_train.values.std()

# Display the results
print("*******x_train")
print("Minimum values:\n", min_x_train)
print("\nMaximum values:\n", max_x_train)
print("\nMean (Average) values:\n", mean_x_train)
print("\nStandard Deviation values:\n", std_x_train)

min_y_val = y_valid.values.min()
max_y_val = y_valid.values.max()
mean_y_val = y_valid.values.mean()
std_y_val = y_valid.values.std()

# Display the results
print("*******y_val")
print("Minimum values:\n", min_y_val)
print("\nMaximum values:\n", max_y_val)
print("\nMean (Average) values:\n", mean_y_val)
print("\nStandard Deviation values:\n", std_y_val)

*******x_train
Minimum values:
 176.0

Maximum values:
 762170.0

Mean (Average) values:
 77548.39447941205

Standard Deviation values:
 77134.6381501306
*******y_val
Minimum values:
 176.0

Maximum values:
 381085.0

Mean (Average) values:
 52548.51355738994

Standard Deviation values:
 54288.12722779933


In [37]:
x_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,228664,11624,20584,5648,71096.0,7932.0,82552.0,24512.0,129808.0,61712.0,...,3388.0,8072.0,41976.0,29888.0,16376.0,9524.0,35728.0,83816.0,88696.0,1888.0
1,381085,2086,125152,7976,76976.0,97608.0,21248.0,1628.0,33096.0,3592.0,...,18296.0,17272.0,19272.0,41696.0,176.0,92016.0,17248.0,23376.0,30672.0,63624.0
2,86728,119872,45504,21936,20864.0,11176.0,21528.0,284.0,20352.0,8312.0,...,45712.0,2356.0,50912.0,26776.0,87504.0,12288.0,52752.0,10296.0,1132.0,3316.0
3,33616,75696,2844,1796,3268.0,10784.0,31192.0,18752.0,115392.0,48296.0,...,13532.0,25344.0,65192.0,29896.0,43512.0,21224.0,83864.0,30912.0,77656.0,49888.0
4,27256,38408,31152,252,23448.0,1544.0,3276.0,41584.0,45968.0,2128.0,...,89328.0,2816.0,68864.0,46984.0,4308.0,22712.0,2428.0,61112.0,38792.0,272.0


In [38]:
print(f"x_train shape: {x_train_shape}")
print(f"x_valid shape: {x_valid_shape}")
#print(f"x_test shape: {x_test_shape}")
print(f"y_train shape: {y_train_shape}")
print(f"y_valid shape: {y_valid_shape}")

x_train shape: (132019, 400)
x_valid shape: (4240, 400)
y_train shape: (132019, 60)
y_valid shape: (4240, 60)


## Prepare Data for training - Augmentation & Rescaling

### Augment then standardize

In [40]:
# If you need to apply augmentation, you can first standardize your data
x_train_noisy = x_train + np.random.normal(0, 0.01, x_train.shape)  # Adding Gaussian noise

y_train_noisy = y_train + np.random.normal(0, 0.01, y_train.shape) # for next-step prediction in time series or regression tasks, we should apply the same transformation to maintain consistency.

# Now you have augmented data
# We can apply more augmentations like shifting or time warping here

print("Shape of data:", x_train_noisy.shape)
print("Shape of data:", y_train_noisy.shape)


Shape of data: (132019, 400)
Shape of data: (132019, 60)


In [41]:
# Add x_train_noisy to the bottom of x_train_standardized
x_train_aug = np.vstack((x_train, x_train_noisy))

# Add y_train_noisy to the bottom of y_train_standardized
y_train_aug = np.vstack((y_train, y_train_noisy))

# Check the shapes
print("Shape of combined x_train:", x_train_aug.shape)
print("Shape of combined y_train:", y_train_aug.shape)

Shape of combined x_train: (264038, 400)
Shape of combined y_train: (264038, 60)


In [42]:
# StandardScaler Transformer

x_scaler = StandardScaler()
x_train_aug_std = x_scaler.fit_transform(x_train_aug)
x_valid_standardized = x_scaler.transform(x_valid)

y_scaler = StandardScaler()
y_train_aug_std = y_scaler.fit_transform(y_train_aug)
y_valid_standardized = y_scaler.transform(y_valid)

## Define function tools

In [43]:
def flush():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

## Define Early stopping

In [44]:
# Define EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor the validation loss
    patience=100,         # Number of epochs to wait after the last improvement
    restore_best_weights=True  # Restore the weights of the best model
)

# PYTORCH - MODELES

## Définition de l'architecture du modèle

In [45]:
# BiLSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiLSTMModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.relu = nn.ReLU()
        self.bilstm = nn.LSTM(512, hidden_size, bidirectional=True, batch_first=True, num_layers=2)
        self.bilstm = nn.LSTM(512, hidden_size, bidirectional=True, batch_first=True, num_layers=2)
        self.bilstm = nn.LSTM(512, hidden_size, bidirectional=True, batch_first=True, num_layers=2)
        self.fc2 = nn.Linear(hidden_size * 2, output_size)  # *2 for bidirectional LSTM

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = x.view(-1, 2, 512)  # Adjust based on input
        x, _ = self.bilstm(x)
        x = x[:, -1, :]
        return self.fc2(x)

## Définition de la taille du modèle

In [None]:
# Définition de la taille du modèle
input_size = taille_fenetre_observee
hidden_size = 800
output_size = taille_fenetre_a_predire  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMModel(input_size, hidden_size, output_size).to(device)

# Loss and Optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


x_train_tensor = torch.tensor(x_train_aug_std, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_aug_std, dtype=torch.float32).to(device)
x_valid_tensor = torch.tensor(x_valid_standardized, dtype=torch.float32).to(device)
y_valid_tensor = torch.tensor(y_valid_standardized, dtype=torch.float32).to(device)



## Définition du monitoring de l'entrainement

In [47]:
# Custom Training Monitor
class TrainingMonitorNotebook:
    def __init__(self):
        self.train_loss = []
        self.val_loss = []

    def update_plot(self, epoch, train_loss, val_loss):
        self.train_loss.append(train_loss)
        self.val_loss.append(val_loss)

        clear_output(wait=True)

        plt.figure(figsize=(10, 6))
        train_line, = plt.plot(self.train_loss, label="Training Loss", color='blue')
        val_line, = plt.plot(self.val_loss, label="Validation Loss", color='orange')
        plt.xlabel("Epochs")
        plt.ylabel("Loss")
        plt.title("Training and Validation Loss Over Epochs")
        plt.legend()
        plt.grid(True)

        mplcursors.cursor([train_line, val_line], hover=True)
        plt.show()

# Instantiate the monitor
training_monitor_notebook = TrainingMonitorNotebook()

## Entrainement du modèle

In [None]:
# Entrainement du modèle
epochs = 450
batch_size = 1024

# Training Loop with Time Series Handling
for epoch in range(epochs):

    # Shuffle les données d'entrainement après chaque epoch.
    # Le shuffling est essfectué entre portions de série temporelle pour un meilleur apprentissage,
    # et non entre toutes les données brutes unitaires pour ne pas supprimer les liens de la série temporelle.
    indices = np.arange(len(x_train_tensor))
    np.random.shuffle(indices)
    x_train_tensor = x_train_tensor[indices]
    y_train_tensor = y_train_tensor[indices]

    model.train()

    epoch_loss = 0
    num_batches = 0

    for i in range(0, x_train_tensor.size(0), batch_size):
        batch_x = x_train_tensor[i:i + batch_size]
        batch_y = y_train_tensor[i:i + batch_size]

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1  # Count actual batches

    avg_train_loss = epoch_loss / num_batches  # Fix loss calculation

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(x_valid_tensor)
        val_loss = criterion(val_outputs, y_valid_tensor).item()

    print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {avg_train_loss:.4f} - Val Loss: {val_loss:.4f}")

    # Update the live plot
    training_monitor_notebook.update_plot(epoch, avg_train_loss, val_loss)

## Evaluation du modèle

In [None]:
# Validation (optional)
model.eval()
with torch.no_grad():
    val_outputs = model(x_valid_tensor)
    val_loss = criterion(val_outputs, y_valid_tensor)
    print(f"Validation Loss: {val_loss.item()}")

In [None]:
val_outputs

In [64]:
# Move val_outputs to CPU, then convert to NumPy and perform inverse scaling
y_pred = y_scaler.inverse_transform(val_outputs.cpu().numpy()).astype(int)
#y_pred = scaler.inverse_transform(val_outputs.cpu().numpy()).astype(int)

In [None]:
print("y_pred.shape: ", y_pred.shape, "\n")

print(y_pred[:4])

print("\ny_pred.min(): ",y_pred.min())

In [66]:
# Replace all negative values with zero
y_pred = np.where(y_pred < 0, 0, y_pred)

In [None]:
# Calculer le nRMSE
nrmse_KPI = round(np.sqrt(mean_squared_error(y_valid, y_pred)) / (max_y_val - min_y_val), 5)
print(f'Validation nRMSE: {nrmse_KPI}')

# Calculer le RMSE
rmse_KPI = round(np.sqrt(mean_squared_error(y_valid, y_pred)), 5)
print(f'Validation RMSE: {rmse_KPI}')

# Calculer le MAE
mae_KPI = round(mean_absolute_error(y_valid, y_pred), 5)
print(f'Validation MAE: {mae_KPI}')

## Prédiction du modèle

In [68]:
# Convert y_pred (NumPy array) to a DataFrame
y_pred_df = pd.DataFrame(y_pred)

# Save the DataFrame to a CSV file
y_pred_df.to_csv('y_pred.csv', index=False)

## Sauvegarde du modèle

In [69]:
import joblib
import torch
import json

# Save Everything

# Save model weights
torch.save(model.state_dict(), "modele.pth")

# Save scalers
joblib.dump(x_scaler, "x_scaler.pkl")
joblib.dump(y_scaler, "y_scaler.pkl")

# Save model parameters (input_size, hidden_size, output_size)
params = {
    "input_size": input_size,
    "hidden_size": hidden_size,
    "output_size": output_size,
    "kpi": {
        "mae": mae_KPI,
        "nrmse": nrmse_KPI,
        "rmse": rmse_KPI
    }
}
with open("modele_parametres.json", "w") as f:
    json.dump(params, f)

In [70]:
#Check the model saved

# Load Everything Later

# Load model parameters
with open("modele_parametres.json", "r") as f:
    params = json.load(f)

loaded_input_size = params["input_size"]
loaded_hidden_size = params["hidden_size"]
loaded_output_size = params["output_size"]

# Reconstruct model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = BiLSTMModel(loaded_input_size, loaded_hidden_size, loaded_output_size).to(device)

# Load model weights
loaded_model.load_state_dict(torch.load("modele.pth", map_location=device))
loaded_model.eval()  # Set model to evaluation mode

# Load scalers
loaded_x_scaler = joblib.load("x_scaler.pkl")
loaded_y_scaler = joblib.load("y_scaler.pkl")

  loaded_model.load_state_dict(torch.load("modele.pth", map_location=device))


In [71]:
# Perform Inference on New Data

x_valid_restandardized = loaded_x_scaler.transform(x_valid)
x_valid_retensor = torch.tensor(x_valid_restandardized, dtype=torch.float32).to(device)

# Ensure x_valid_tensor is on the same device
x_valid_retensor = x_valid_retensor.to(device)

# Perform inference to forecast the next x steps
pred_model_loaded = loaded_model(x_valid_retensor)

# Move to CPU before converting to NumPy
pred_model_loaded = pred_model_loaded.cpu().detach().numpy()

# Use the scaler to inverse transform
pred_model_loaded = loaded_y_scaler.inverse_transform(pred_model_loaded).astype(int)