In [2]:
import sys
path_to_challenge_sncf_2024 = '/Users/gurvanrichardeau/Python_projects/LocalCodes/challenge_sncf_2024'
sys.path.append(path_to_challenge_sncf_2024 + '/utils')

from PandasToolsFunction import *
from DataPreprocessingTools import *

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm


import json

import matplotlib.pyplot as plt

In [3]:
# Calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    threshold = 0.4
    y_true_floor = np.maximum(y_true, threshold)
    return np.mean(np.abs((y_true_floor - y_pred) / y_true_floor))

In [4]:
# Define the linear regression model
class LinearRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(LinearRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)

In [10]:
class RegressorModelPytorch():
    def __init__(self) -> None:

        self.submission = None
        
    def preprocessing_data(self, train_data: pd.DataFrame, test_size = 0.01, station_col = 'station', features_col = ['week', 'day_numeric', 'day_type'], label_col = 'y' ):
        
        self.features_col = features_col
        self.label_col = label_col

        self.features_dim = len(features_col)

        self.stations = train_data[station_col].unique()

        # Creating one model per station. Abandonned because will not be optimized in memory.
        # self.models = {station : LinearRegressionModel(features_dim) for station in stations } 
         
        # We need to use only one scaler for every station, we fit on one station and we then reuse it for every other. 
        # Be careful to use the same for testing than for training, indeed, as there are not as many weeks on the test, set, the scaling would be different
        # if it is made on the test.set
        self.scaler = MinMaxScaler()

        # Training the scaler once and for all :

        mask = (train_data['station'] == self.stations[0])
        features = train_data[mask][features_col]
        X_normalized = self.scaler.fit_transform(features)
        
        self.datasets = {} # each dataset will contain four sets : X_train, X_test, Y_train, Y_test

        for station in self.stations :
            station_indices = train_data.index[train_data[station_col] == station]
            features = train_data.loc[station_indices, features_col]
            y = train_data.loc[station_indices, label_col]

            X_normalized = self.scaler.transform(features)

            self.datasets[station] = train_test_split(X_normalized, y, test_size=test_size, shuffle=True)

    def train(self, epochs = 10, lr = 0.01) : #, train_data: pd.DataFrame, features_col = ['week', 'day_numeric', 'day_type'], label_col = 'y'):
        # Creating a linear regression model
        
        #self.preprocessing_data(train_data, features_col, label_col)

        for station in tqdm(self.stations, desc="Stations"):
            X_train, X_test, y_train, y_test = self.datasets[station]

            model = LinearRegressionModel(self.features_dim)
            optimizer = optim.SGD(model.parameters(), lr=lr)

            for epoch in tqdm(epochs, desc= f"station : {station}, epochs :") :
                # Training the model
                

                # Making predictions on the test set
                y_pred = self.model(X_test)

                # Evaluating the model
                mape = mean_absolute_percentage_error(y_test, y_pred)

                print(f'MAPE score: {mape}')

                # Print the coefficients of the model
                print('Coefficients:', self.model.coef_)

 
    def test(self, test_data_init: pd.DataFrame):
        
        for model in self.models : 
            model.eval_mode()

        test_data = test_data_init.copy()

        test_data_features = test_data[self.features_col].values

        # Careful using the same scaler that was fitted on the training set
        X_test_normalized = self.scaler.transform(test_data_features)

        y_pred = self.model.predict(X_test_normalized)
        
        y_pred = np.maximum(y_pred, 0)
        test_data[self.label_col] = y_pred
        self.submission = test_data[['index', 'y']]
        return test_data
    
    def save(self, name = "avg_model"):
        pass

    def load(self, name="avg_model"):
       pass

    def save_submission(self, name = "submission"):
        self.submission.to_csv("../submissions/" + name + ".csv", index=False)    

In [55]:
class RegressorModelOneStationsklearn():
    def __init__(self, ) -> None:

        self.submission = None
        self.model = LinearRegression()

    def preprocessing_data(self, train_data: pd.DataFrame, features_col = ['week', 'day_numeric', 'day_type'], label_col = 'y' ):
        
        self.features_col = features_col
        self.label_col = label_col

        features = train_data[features_col]
        
        y = train_data[label_col]

        # Normalize the features using MinMaxScaler
        self.scaler = MinMaxScaler()
        X_normalized = self.scaler.fit_transform(features)

        # print(f"{X_normalized = }") uncomment to see what MinMaxScaler does.

        # Splitting the data into training and testing sets
        self.datasets = train_test_split(X_normalized, y, test_size=0.2, shuffle=True)

    def linear_regression_train(self) : #, train_data: pd.DataFrame, features_col = ['week', 'day_numeric', 'day_type'], label_col = 'y'):
        # Creating a linear regression model
        
        #self.preprocessing_data(train_data, features_col, label_col)

        X_train, X_test, y_train, y_test = self.datasets

        # Training the model
        self.model.fit(X_train, y_train)

        # Making predictions on the test set
        y_pred = self.model.predict(X_test)

        # Evaluating the model
        mape = mean_absolute_percentage_error(y_test, y_pred)

        print(f'MAPE score: {mape}')

        # Print the coefficients of the model
        print('Coefficients:', self.model.coef_)

    def train(self, mode='linear_regression'):

        if mode == 'linear_regression':
            self.linear_regression_train()
        else :
            pass
 
    def test(self, test_data_init: pd.DataFrame):
        
        test_data = test_data_init.copy()

        test_data_features = test_data[self.features_col].values

        # Careful using the same scaler that was fitted on the training set
        X_test_normalized = self.scaler.transform(test_data_features)

        y_pred = self.model.predict(X_test_normalized)
        
        y_pred = np.maximum(y_pred, 0)
        test_data[self.label_col] = y_pred
        self.submission = test_data[['index', 'y']]
        return test_data
    
    def save(self, name = "avg_model"):
        pass

    def load(self, name="avg_model"):
       pass

    def save_submission(self, name = "submission"):
        self.submission.to_csv("../submissions/" + name + ".csv", index=False)    

In [7]:
# Charger les données d'entraînement
train_data = pd.read_csv('../data/train_data_day_typed_cov_replaced_all_features.csv')
train_data['date'] = pd.to_datetime(train_data['date'])

# Work only on 2019-2022 included
train_data = date_filter(train_data, start_date='2019-12-31', end_date='2022-12-31')

#one_st_filter = make_station_filter(train_data, stations=['1J7'])
#train_data_one_station = train_data[one_st_filter]

In [12]:
train_data['1J7']

KeyError: '1J7'

In [None]:
tr

In [11]:
regressor = RegressorModelPytorch()
regressor.preprocessing_data(train_data)


dict_keys(['1J7', 'O2O', 'NZV', '8QR', 'UMC', 'FK3', 'I1K', 'KY6', 'V8D', 'EK4', '357', 'O7B', 'RYX', 'TEP', 'CUF', '4UY', 'I9K', 'FA8', 'EPD', 'VML', 'GV0', '5KB', '6WR', 'BRZ', 'BUY', 'C3A', '276', 'IFB', '1PW', '6UW', 'RR1', 'XTG', 'JXV', 'OD3', 'YTV', 'Z2T', 'GOJ', '8X9', 'R83', 'XIG', 'CSD', '30R', 'OOY', 'JCN', 'G10', 'MM6', 'NT4', 'IOR', 'VI4', '8WZ', 'FYB', '5GK', 'QQJ', 'E9M', 'F77', 'JX6', 'HW6', '30I', 'CWQ', '4IM', '59A', 'BZ0', '5S5', 'MK9', 'JCW', '34L', 'EBP', 'V4D', '3KU', 'BNR', 'VGQ', 'B3W', 'HOX', '9WY', 'N7J', '4YU', 'JD3', 'RF2', 'FBH', 'P4J', 'QV8', 'SH9', 'DBV', '3NQ', '651', 'HOC', '3W6', 'EIZ', '87K', '289', 'XRB', 'BKJ', 'T91', 'NVE', 'CJP', '5U5', 'BPW', 'GKH', 'XOK', '72S', '4PO', '5PK', '5WG', 'J9O', '6DU', 'MIH', 'DSX', 'F7I', 'P3G', 'B3D', 'B61', '8E0', '7NG', '222', '1TE', 'CYD', 'GN3', '11W', '4SH', '3NS', 'OTS', '25G', 'XUL', 'Z5T', '56P', 'WIX', 'F30', 'OUG', 'APN', 'WRR', 'K5Q', 'D68', '2P1', 'B65', 'ROF', '1EJ', 'H1M', '1I8', 'JP8', '2PZ', 'KQC', 'Z

In [13]:
test_data = pd.read_csv('../data/test_data_day_typed_cov_replaced_all_features.csv')
test_data['date'] = pd.to_datetime(test_data['date'])
one_st_filter2 = make_station_filter(test_data, stations=['1J7'])
test_data_one_station = test_data[one_st_filter2]
test_data_one_station

Unnamed: 0,index,date,week,day_name,day_numeric,station,day_type
0,2023-01-01_1J7,2023-01-01,52,Sunday,6,1J7,7
432,2023-02-01_1J7,2023-02-01,5,Wednesday,2,1J7,0
870,2023-03-01_1J7,2023-03-01,9,Wednesday,2,1J7,1
1307,2023-04-01_1J7,2023-04-01,13,Saturday,5,1J7,0
1737,2023-05-01_1J7,2023-05-01,18,Monday,0,1J7,8
...,...,...,...,...,...,...,...
76483,2023-05-30_1J7,2023-05-30,22,Tuesday,1,1J7,0
76919,2023-06-30_1J7,2023-06-30,26,Friday,4,1J7,0
77356,2023-01-31_1J7,2023-01-31,5,Tuesday,1,1J7,0
77777,2023-03-31_1J7,2023-03-31,13,Friday,4,1J7,0


In [57]:
result = regressor.test(test_data_init=test_data_one_station)
result



Unnamed: 0,index,date,week,day_name,day_numeric,station,day_type,y
0,2023-01-01_1J7,2023-01-01,52,Sunday,6,1J7,7,0.000000
432,2023-02-01_1J7,2023-02-01,5,Wednesday,2,1J7,0,159.875167
870,2023-03-01_1J7,2023-03-01,9,Wednesday,2,1J7,1,150.028338
1307,2023-04-01_1J7,2023-04-01,13,Saturday,5,1J7,0,84.714748
1737,2023-05-01_1J7,2023-05-01,18,Monday,0,1J7,8,130.474052
...,...,...,...,...,...,...,...,...
76483,2023-05-30_1J7,2023-05-30,22,Tuesday,1,1J7,0,185.984068
76919,2023-06-30_1J7,2023-06-30,26,Friday,4,1J7,0,110.608986
77356,2023-01-31_1J7,2023-01-31,5,Tuesday,1,1J7,0,185.071749
77777,2023-03-31_1J7,2023-03-31,13,Friday,4,1J7,0,109.911330


In [58]:
regressor.submission

Unnamed: 0,index,y
0,2023-01-01_1J7,0.000000
432,2023-02-01_1J7,159.875167
870,2023-03-01_1J7,150.028338
1307,2023-04-01_1J7,84.714748
1737,2023-05-01_1J7,130.474052
...,...,...
76483,2023-05-30_1J7,185.984068
76919,2023-06-30_1J7,110.608986
77356,2023-01-31_1J7,185.071749
77777,2023-03-31_1J7,109.911330
