In [1]:
from utils.PandasToolsFunction import *
from utils.DataPreprocessingTools import *
from utils.loss_functions import mean_absolute_percentage_error

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

import json

import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [2]:
def train_data_station_split(train_data_init: pd.DataFrame, size = 50, station_col ='station'):
    train_data = train_data_init.copy()

    train_data.sort_values(station_col, inplace=True)
    stations = train_data[station_col].unique()
    total_stations = len(stations)

    station_count =0
    split_dataframes = []
    end = False
    
    while not end :
        if station_count + size <= total_stations :
            current_stations = stations[station_count: station_count + size]
        else : 
            current_stations = stations[station_count:]
            end = True

        station_count += size
        stations_filter = make_station_filter(train_data, current_stations)
        current_dataframe = train_data[stations_filter]
        split_dataframes.append(current_dataframe)

    return split_dataframes

In [4]:
# Define the linear regression model
class LinearRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)
    
# Define the MLP regression model
class MLP(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim=1, n_layers = 2):
        super(MLP, self).__init__()
        self.input_layer = torch.nn.Linear(input_dim, hidden_dim)
        self.hidden_layers = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(n_layers - 1)])
        self.output_layer = torch.nn.Linear(hidden_dim, output_dim)
        self.activation = torch.nn.ReLU()
    
    def forward(self, x):
        x = self.activation(self.input_layer(x))
        for layer in self.hidden_layers:
            x = self.activation(layer(x))
        return self.output_layer(x)

In [5]:
class NeuralNetworkModelPytorch():
    """
    Split data per stations to not overcharge memory.
    """

    def __init__(self, train_data: pd.DataFrame, size = 50, model_type='MLP', hidden_dim =32, n_layers =2, station_col = 'station', features_col = ['week', 'day_numeric', 'day_type'], label_col = 'y') -> None:

        self.submission = None

        self.model_type = model_type
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.train_data = train_data
        self.station_col = station_col
        self.features_col = features_col
        self.label_col = label_col

        self.features_dim = len(features_col)

        self.setting_scaler(train_data)

        # We split the training date per subsets of "size" stations each.
        self.split_dataframes = train_data_station_split(self.train_data, size, station_col)
    
    def setting_scaler(self, train_data: pd.DataFrame):
                
        # We need to use only one scaler for every station, we fit on one station and we then reuse it for every other. 
        # Be careful to use the same for testing than for training, indeed, as there are not as many weeks on the test, set, the scaling would be different
        # if it is made on the test.set

        self.scaler = MinMaxScaler()
        # Training the scaler once and for all :

        mask = (train_data['station'] == train_data[self.station_col].unique()[0])
        features = train_data[mask][self.features_col]
        _ = self.scaler.fit_transform(features)

    def preprocessing_data(self, small_train_data: pd.DataFrame, test_size = 0.01, lr= 0.01, Optim = optim.SGD):

        stations = small_train_data[self.station_col].unique()

        # Creating one model per station. 
        if self.model_type == 'MLP':
            modules=[MLP(self.features_dim, hidden_dim=self.hidden_dim, n_layers=self.n_layers) for _ in range(len(stations))]
        
        self.models = {station : (module, Optim(params=module.parameters(), lr=lr) ) for station, module in zip(stations, modules) } 
        self.datasets = {} # each dataset will contain four sets : X_train, X_test, Y_train, Y_test

        # creating datasets

        for station in stations :

            station_indices = small_train_data.index[small_train_data[self.station_col] == station]
            features = small_train_data.loc[station_indices, self.features_col]
            y = small_train_data.loc[station_indices, self.label_col]
            # print("TYPE Y", type(y)) # pandas.Series
            y = np.array(y)

            X_normalized = self.scaler.transform(features)

            self.datasets[station] = train_test_split(X_normalized, y, test_size=test_size, random_state= 42, shuffle=False)

    def train(self, epochs = 10, test_size = 0.01, lr = 0.01, Optim = optim.SGD, print_period: int = 1000, threshold = 0.1) : #, train_data: pd.DataFrame, features_col = ['week', 'day_numeric', 'day_type'], label_col = 'y'):

        for k, small_train_data in tqdm(enumerate(self.split_dataframes), desc="Split dataframes") :
            
            self.preprocessing_data(small_train_data, test_size, lr, Optim)

            for station, model in tqdm(self.models.items(), desc="Stations"):
                
                module, optimizer = model
                X_train, X_test, y_train, y_test = self.datasets[station]

                scheduler = StepLR(optimizer,
                   step_size = 1,
                   gamma = 0.5)

                scheduler_period = 10

                # Converting to tensors and putting them on the good device
                X_train, X_test = torch.Tensor(X_train, device=device), torch.Tensor(X_test, device=device)
                y_train, y_test = torch.Tensor(y_train, device=device), torch.Tensor(y_test, device=device)  
                
                module = module.to(device)
                
                #print(X_train)

                for epoch in tqdm(range(epochs), desc=f"Station: {station}, epochs:", leave=False):
                    # Training the model

                    optimizer.zero_grad()
                    # Making predictions on the test set
                    y_pred = module(X_train)

                    y_pred = torch.maximum(y_pred, torch.tensor(threshold))
                    # Evaluating the model
                    loss = mean_absolute_percentage_error(y_train, y_pred, threshold)
                    loss.backward()
                    optimizer.step()
                    
                    # Update the progress bar with the current loss
                    tqdm.write(f'Station: {station}, dataset_size = {len(X_train)}, Epoch {epoch}, Loss: {loss:.4f}', end='\r') # line 1
                    #tqdm.write(f'{loss}', end='\r') # line 1
                    
                    if (epoch+1) % scheduler_period == 0 :
                        pass #scheduler.step()
                
                ## Code score on test_set and maybe something that takes best model like in ALTEGRAD project. ##
                
                test_pred = module(X_test)
                print('Score on test set :', mean_absolute_percentage_error(y_test, test_pred, threshold) )
            #self.save_small_data(self, path= f"{k}")
            break

 
    def test(self, test_data_init: pd.DataFrame):
        
        for model in self.models : 
            model.eval_mode()

        test_data = test_data_init.copy()

        test_data_features = test_data[self.features_col].values

        # Careful using the same scaler that was fitted on the training set
        X_test_normalized = self.scaler.transform(test_data_features)

        y_pred = self.model.predict(X_test_normalized)
        
        y_pred = np.maximum(y_pred, 0)
        test_data[self.label_col] = y_pred
        self.submission = test_data[['index', 'y']]
        return test_data
    
    """def save_small_data(self, model: nn.Module, name = "avg_model"):
        print('validation loss improved saving checkpoint...')
        save_path =
        
        torch.save({
        'epoch': i,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'validation_accuracy': val_loss,
        'loss': loss,
        }, save_path)"""

    def load(self, name="avg_model"):
       pass

    def save_submission(self, name = "submission"):
        self.submission.to_csv("../submissions/" + name + ".csv", index=False)    

In [7]:
# Charger les données d'entraînement
train_data = pd.read_csv('data/train_data_day_typed_cov_replaced_all_features.csv')
train_data['date'] = pd.to_datetime(train_data['date'])

# Work only on 2019-2022 included
train_data = date_filter(train_data, start_date='2019-12-31', end_date='2022-12-31')

#one_st_filter = make_station_filter(train_data, stations=['1J7'])
#train_data_one_station = train_data[one_st_filter]

In [9]:
regressor = NeuralNetworkModelPytorch(train_data, size=10, hidden_dim=512, n_layers=2)
regressor.train(epochs= int(1e3), Optim=optim.Adam, lr=3e-4)

Split dataframes: 0it [00:00, ?it/s]

Stations:   0%|          | 0/10 [00:00<?, ?it/s]

Station: 003, epochs::   0%|          | 0/1000 [00:00<?, ?it/s]

Station: 003, dataset_size = 782, Epoch 945, Loss: 379.2182

RuntimeError: cannot release un-acquired lock

In [None]:
test_data = pd.read_csv('data/test_data_day_typed_cov_replaced_all_features.csv')
test_data['date'] = pd.to_datetime(test_data['date'])
one_st_filter2 = make_station_filter(test_data, stations=['1J7'])
test_data_one_station = test_data[one_st_filter2]
test_data_one_station

Unnamed: 0,index,date,week,day_name,day_numeric,station,day_type
0,2023-01-01_1J7,2023-01-01,52,Sunday,6,1J7,7
432,2023-02-01_1J7,2023-02-01,5,Wednesday,2,1J7,0
870,2023-03-01_1J7,2023-03-01,9,Wednesday,2,1J7,1
1307,2023-04-01_1J7,2023-04-01,13,Saturday,5,1J7,0
1737,2023-05-01_1J7,2023-05-01,18,Monday,0,1J7,8
...,...,...,...,...,...,...,...
76483,2023-05-30_1J7,2023-05-30,22,Tuesday,1,1J7,0
76919,2023-06-30_1J7,2023-06-30,26,Friday,4,1J7,0
77356,2023-01-31_1J7,2023-01-31,5,Tuesday,1,1J7,0
77777,2023-03-31_1J7,2023-03-31,13,Friday,4,1J7,0


In [None]:
result = regressor.test(test_data_init=test_data_one_station)
result

AttributeError: 'str' object has no attribute 'eval_mode'

In [None]:
regressor.submission

Unnamed: 0,index,y
0,2023-01-01_1J7,0.000000
432,2023-02-01_1J7,159.875167
870,2023-03-01_1J7,150.028338
1307,2023-04-01_1J7,84.714748
1737,2023-05-01_1J7,130.474052
...,...,...
76483,2023-05-30_1J7,185.984068
76919,2023-06-30_1J7,110.608986
77356,2023-01-31_1J7,185.071749
77777,2023-03-31_1J7,109.911330
