In [2]:
import sys
path_to_challenge_sncf_2024 = '/Users/gurvanrichardeau/Python_projects/LocalCodes/challenge_sncf_2024'
sys.path.append(path_to_challenge_sncf_2024 + '/utils')

from PandasToolsFunction import *
from DataPreprocessingTools import *

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm


import json

import matplotlib.pyplot as plt

In [3]:
def mean_absolute_percentage_error(y_true, y_pred, threshold=0.4):
    """Mean absolute percentage error with the true values floored at ``threshold``.

    Every true value smaller than ``threshold`` is replaced by ``threshold``
    before the relative error is computed, so the denominator can never drop
    below ``threshold``. The floored value is used in both the numerator and
    the denominator.

    Both inputs are converted to NumPy arrays first, so elements are compared
    by position. Without that, two pandas Series carrying different indexes
    would be aligned on their labels and the result would silently become NaN.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, same shape as ``y_true``
            (a scalar is also accepted and broadcast).
        threshold: Lower bound applied to ``y_true``. Defaults to 0.4.

    Returns:
        The mean of ``|floor(y_true) - y_pred| / floor(y_true)`` as a float.

    Raises:
        ValueError: If both inputs are arrays and their shapes differ, which
            would otherwise broadcast (e.g. ``(n,)`` against ``(n, 1)``) and
            produce a meaningless score.
    """
    y_true_arr = np.asarray(y_true, dtype=float)
    y_pred_arr = np.asarray(y_pred, dtype=float)

    # Scalars (ndim == 0) are still allowed to broadcast; arrays must match.
    if y_true_arr.ndim and y_pred_arr.ndim and y_true_arr.shape != y_pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got "
            f"{y_true_arr.shape} and {y_pred_arr.shape}"
        )

    y_true_floor = np.maximum(y_true_arr, threshold)
    return np.mean(np.abs((y_true_floor - y_pred_arr) / y_true_floor))

In [None]:
class LinearRegressionModel(nn.Module):
    """Linear regression expressed as a PyTorch module: one affine layer, one output."""

    def __init__(self, input_size):
        """Create the layer.

        Args:
            input_size: Number of input features expected by the layer.
        """
        super().__init__()
        # Single fully connected layer mapping `input_size` features to 1 value.
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output."""
        prediction = self.linear(x)
        return prediction

In [None]:
class RegressorModelPytorch():
    """Linear-regression baseline trained with PyTorch.

    Intended call order: ``preprocessing_data`` -> ``train`` -> ``test`` ->
    ``save_submission``.

    The network is a ``LinearRegressionModel`` fitted by full-batch gradient
    descent on the mean squared error.
    """

    def __init__(self, n_epochs: int = 2000, learning_rate: float = 0.1, random_state=None) -> None:
        """Store the hyper-parameters; the network itself is built later.

        Args:
            n_epochs: Number of full-batch gradient steps run by
                ``linear_regression_train``.
            learning_rate: Step size handed to ``optim.SGD``. NOTE(review): the
                default was picked for a handful of features scaled to [0, 1];
                lower it if the loss diverges with many features.
            random_state: Optional int. When given it seeds both the
                train/validation split and the weight initialisation
                (through ``torch.manual_seed``, which is process-global).
        """
        self.submission = None
        # The layer needs the number of input features, which is only known
        # once preprocessing_data() has seen the training frame.
        self.model = None
        self.n_epochs = n_epochs
        self.learning_rate = learning_rate
        self.random_state = random_state

    def preprocessing_data(self, train_data: pd.DataFrame, features_col = ('week', 'day_numeric', 'day_type'), label_col = 'y' ):
        """Scale the features, split train/validation and build a fresh network.

        Args:
            train_data: Frame holding the feature columns and the label column.
            features_col: Name(s) of the feature columns (a single string or
                any iterable of strings).
            label_col: Name of the target column.

        Side effects:
            Sets ``features_col``, ``label_col``, ``scaler``, ``datasets``
            (``X_train, X_test, y_train, y_test`` with a 80/20 split) and
            replaces ``model`` with an untrained ``LinearRegressionModel``.
        """
        # Always keep a private list: a bare string would otherwise select a
        # Series, and a tuple would be read by pandas as one single key.
        self.features_col = [features_col] if isinstance(features_col, str) else list(features_col)
        self.label_col = label_col

        features = train_data[self.features_col]
        y = train_data[label_col]

        # Normalize the features using MinMaxScaler
        self.scaler = MinMaxScaler()
        X_normalized = self.scaler.fit_transform(features)

        # Splitting the data into training and testing sets
        self.datasets = train_test_split(
            X_normalized, y, test_size=0.2, shuffle=True, random_state=self.random_state
        )

        # Seed before the layer is created so its initial weights are reproducible.
        if self.random_state is not None:
            torch.manual_seed(self.random_state)
        self.model = LinearRegressionModel(X_normalized.shape[1])

    def _predict(self, X) -> np.ndarray:
        """Run the network on already-scaled features and return a flat NumPy array."""
        inputs = torch.as_tensor(np.asarray(X), dtype=torch.float32)
        self.model.eval()
        with torch.no_grad():
            # The model emits one column; flatten it to one value per row.
            return self.model(inputs).reshape(-1).numpy()

    def linear_regression_train(self):
        """Fit the network on the training split and report the validation MAPE.

        Raises:
            RuntimeError: If ``preprocessing_data`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError("preprocessing_data() must be called before training.")

        X_train, X_test, y_train, y_test = self.datasets

        inputs = torch.as_tensor(np.asarray(X_train), dtype=torch.float32)
        # Column vector, so it lines up with the (n_rows, 1) output of the model.
        targets = torch.as_tensor(np.asarray(y_train), dtype=torch.float32).reshape(-1, 1)

        criterion = nn.MSELoss()
        optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate)

        # Training the model: one gradient step on the whole training set per epoch.
        self.model.train()
        for _ in range(self.n_epochs):
            optimizer.zero_grad()
            loss = criterion(self.model(inputs), targets)
            loss.backward()
            optimizer.step()

        # Making predictions on the held-out split
        y_pred = self._predict(X_test)

        # Evaluating the model
        mape = mean_absolute_percentage_error(y_test, y_pred)

        print(f'MAPE score: {mape}')

        # Print the coefficients of the model
        print('Coefficients:', self.model.linear.weight.detach().numpy().ravel())

    def train(self, mode='linear_regression'):
        """Train with the requested strategy.

        Args:
            mode: Only ``'linear_regression'`` is supported.

        Raises:
            ValueError: For any other mode, instead of silently leaving the
                model untrained.
        """
        if mode == 'linear_regression':
            self.linear_regression_train()
        else:
            raise ValueError(f"Unknown training mode: {mode!r}")

    def test(self, test_data_init: pd.DataFrame):
        """Predict the label for every row of ``test_data_init``.

        The input frame is left untouched. Predictions are clipped at 0.

        Args:
            test_data_init: Frame holding the feature columns and an
                ``'index'`` column.

        Returns:
            A copy of the input with the predictions stored in ``label_col``.
            The ``['index', label_col]`` slice is also kept in ``submission``.
        """
        test_data = test_data_init.copy()

        # Careful using the same scaler that was fitted on the training set.
        # The DataFrame (not .values) is passed so the column names match the
        # ones the scaler was fitted with.
        X_test_normalized = self.scaler.transform(test_data[self.features_col])

        y_pred = np.maximum(self._predict(X_test_normalized), 0)
        test_data[self.label_col] = y_pred
        self.submission = test_data[['index', self.label_col]]
        return test_data

    def save(self, name = "avg_model"):
        """Placeholder: persisting the model is not implemented, nothing is written."""
        pass

    def load(self, name="avg_model"):
        """Placeholder: restoring a model is not implemented, nothing is read."""
        pass

    def save_submission(self, name = "submission"):
        """Write ``submission`` to ``../submissions/<name>.csv`` without the row index.

        Raises:
            RuntimeError: If ``test`` has not produced a submission yet.
        """
        if self.submission is None:
            raise RuntimeError("No submission available: call test() first.")
        self.submission.to_csv("../submissions/" + name + ".csv", index=False)

In [55]:
class RegressorModelOneStationsklearn():
    """Linear-regression baseline built on scikit-learn's ``LinearRegression``.

    Intended call order: ``preprocessing_data`` -> ``train`` -> ``test`` ->
    ``save_submission``.
    """

    def __init__(self, random_state=None) -> None:
        """Create an unfitted regressor.

        Args:
            random_state: Optional seed forwarded to ``train_test_split`` so
                the train/validation split (and the reported score) can be
                reproduced. ``None`` keeps the split random.
        """
        self.submission = None
        self.model = LinearRegression()
        self.random_state = random_state

    def preprocessing_data(self, train_data: pd.DataFrame, features_col = ('week', 'day_numeric', 'day_type'), label_col = 'y' ):
        """Scale the features and split them into train/validation sets.

        Args:
            train_data: Frame holding the feature columns and the label column.
            features_col: Name(s) of the feature columns (a single string or
                any iterable of strings).
            label_col: Name of the target column.

        Side effects:
            Sets ``features_col``, ``label_col``, ``scaler`` and ``datasets``
            (``X_train, X_test, y_train, y_test`` with a 80/20 split).
        """
        # Always keep a private list: a bare string would otherwise select a
        # Series, and a tuple would be read by pandas as one single key.
        self.features_col = [features_col] if isinstance(features_col, str) else list(features_col)
        self.label_col = label_col

        features = train_data[self.features_col]
        y = train_data[label_col]

        # Normalize the features using MinMaxScaler
        self.scaler = MinMaxScaler()
        X_normalized = self.scaler.fit_transform(features)

        # Splitting the data into training and testing sets
        self.datasets = train_test_split(
            X_normalized, y, test_size=0.2, shuffle=True, random_state=self.random_state
        )

    def linear_regression_train(self):
        """Fit the model on the training split and report the validation MAPE.

        Requires ``preprocessing_data`` to have been called, since it reads
        ``self.datasets``.
        """
        X_train, X_test, y_train, y_test = self.datasets

        # Training the model
        self.model.fit(X_train, y_train)

        # Making predictions on the held-out split
        y_pred = self.model.predict(X_test)

        # Evaluating the model
        mape = mean_absolute_percentage_error(y_test, y_pred)

        print(f'MAPE score: {mape}')

        # Print the coefficients of the model
        print('Coefficients:', self.model.coef_)

    def train(self, mode='linear_regression'):
        """Train with the requested strategy.

        Args:
            mode: Only ``'linear_regression'`` is supported.

        Raises:
            ValueError: For any other mode, instead of silently leaving the
                model unfitted.
        """
        if mode == 'linear_regression':
            self.linear_regression_train()
        else:
            raise ValueError(f"Unknown training mode: {mode!r}")

    def test(self, test_data_init: pd.DataFrame):
        """Predict the label for every row of ``test_data_init``.

        The input frame is left untouched. Predictions are clipped at 0.

        Args:
            test_data_init: Frame holding the feature columns and an
                ``'index'`` column.

        Returns:
            A copy of the input with the predictions stored in ``label_col``.
            The ``['index', label_col]`` slice is also kept in ``submission``.
        """
        test_data = test_data_init.copy()

        # Careful using the same scaler that was fitted on the training set.
        # The DataFrame (not .values) is passed so the column names match the
        # ones the scaler was fitted with.
        X_test_normalized = self.scaler.transform(test_data[self.features_col])

        y_pred = np.maximum(self.model.predict(X_test_normalized), 0)
        test_data[self.label_col] = y_pred
        self.submission = test_data[['index', self.label_col]]
        return test_data

    def save(self, name = "avg_model"):
        """Placeholder: persisting the model is not implemented, nothing is written."""
        pass

    def load(self, name="avg_model"):
        """Placeholder: restoring a model is not implemented, nothing is read."""
        pass

    def save_submission(self, name = "submission"):
        """Write ``submission`` to ``../submissions/<name>.csv`` without the row index.

        Raises:
            RuntimeError: If ``test`` has not produced a submission yet.
        """
        if self.submission is None:
            raise RuntimeError("No submission available: call test() first.")
        self.submission.to_csv("../submissions/" + name + ".csv", index=False)

In [9]:
# Load the training data and parse the `date` column into datetimes.
train_data = (
    pd.read_csv('../data/train_data_day_typed_cov_replaced_all_features.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Keep only the rows date_filter selects for the 2019-12-31 .. 2022-12-31 window
# (how the two boundaries are treated is decided by the helper).
train_data = date_filter(train_data, start_date='2019-12-31', end_date='2022-12-31')

# Restrict to station 1J7.
# NOTE(review): make_station_filter presumably returns a boolean row mask - confirm.
one_st_filter = make_station_filter(train_data, stations=['1J7'])
train_data_one_station = train_data[one_st_filter]

In [10]:
# Display the training frame restricted to the selected station.
train_data_one_station

Unnamed: 0,date,week,day_name,day_numeric,station,y,day_type
2124,2022-01-01,52,Saturday,5,1J7,26,7
4685,2022-02-01,5,Tuesday,1,1J7,166,0
7256,2022-03-01,9,Tuesday,1,1J7,170,1
9810,2022-04-01,13,Friday,4,1J7,146,0
12372,2022-05-01,17,Sunday,6,1J7,20,8
...,...,...,...,...,...,...,...
1097444,2020-05-31,22,Tuesday,1,1J7,153,1
1097872,2020-07-31,30,Sunday,6,1J7,51,1
1098279,2020-08-31,35,Wednesday,2,1J7,169,1
1098717,2020-10-31,44,Monday,0,1J7,129,1


In [56]:
# Fit the scikit-learn baseline on the single-station training frame.
# The class defined in this notebook is RegressorModelOneStationsklearn; the
# previous name (RegressorModelsklearn) does not exist on a fresh kernel.
regressor = RegressorModelOneStationsklearn()
regressor.preprocessing_data(train_data_one_station)
regressor.train(mode='linear_regression')

MAPE score: 0.49937413799727143
Coefficients: [   2.73695746 -151.1794905  -160.9838683 ]


In [13]:
# Load the test data and parse the `date` column into datetimes.
test_data = (
    pd.read_csv('../data/test_data_day_typed_cov_replaced_all_features.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Restrict to station 1J7, mirroring the selection applied to the training data.
one_st_filter2 = make_station_filter(test_data, stations=['1J7'])
test_data_one_station = test_data[one_st_filter2]
test_data_one_station

Unnamed: 0,index,date,week,day_name,day_numeric,station,day_type
0,2023-01-01_1J7,2023-01-01,52,Sunday,6,1J7,7
432,2023-02-01_1J7,2023-02-01,5,Wednesday,2,1J7,0
870,2023-03-01_1J7,2023-03-01,9,Wednesday,2,1J7,1
1307,2023-04-01_1J7,2023-04-01,13,Saturday,5,1J7,0
1737,2023-05-01_1J7,2023-05-01,18,Monday,0,1J7,8
...,...,...,...,...,...,...,...
76483,2023-05-30_1J7,2023-05-30,22,Tuesday,1,1J7,0
76919,2023-06-30_1J7,2023-06-30,26,Friday,4,1J7,0
77356,2023-01-31_1J7,2023-01-31,5,Tuesday,1,1J7,0
77777,2023-03-31_1J7,2023-03-31,13,Friday,4,1J7,0


In [57]:
# Predict on the single-station test frame. test() works on a copy of its input,
# returns that copy with the predicted label column added, and stores the
# submission slice on the regressor.
result = regressor.test(test_data_init=test_data_one_station)
result



Unnamed: 0,index,date,week,day_name,day_numeric,station,day_type,y
0,2023-01-01_1J7,2023-01-01,52,Sunday,6,1J7,7,0.000000
432,2023-02-01_1J7,2023-02-01,5,Wednesday,2,1J7,0,159.875167
870,2023-03-01_1J7,2023-03-01,9,Wednesday,2,1J7,1,150.028338
1307,2023-04-01_1J7,2023-04-01,13,Saturday,5,1J7,0,84.714748
1737,2023-05-01_1J7,2023-05-01,18,Monday,0,1J7,8,130.474052
...,...,...,...,...,...,...,...,...
76483,2023-05-30_1J7,2023-05-30,22,Tuesday,1,1J7,0,185.984068
76919,2023-06-30_1J7,2023-06-30,26,Friday,4,1J7,0,110.608986
77356,2023-01-31_1J7,2023-01-31,5,Tuesday,1,1J7,0,185.071749
77777,2023-03-31_1J7,2023-03-31,13,Friday,4,1J7,0,109.911330


In [58]:
# Display the submission frame (row identifier and prediction) stored by the last test() call.
regressor.submission

Unnamed: 0,index,y
0,2023-01-01_1J7,0.000000
432,2023-02-01_1J7,159.875167
870,2023-03-01_1J7,150.028338
1307,2023-04-01_1J7,84.714748
1737,2023-05-01_1J7,130.474052
...,...,...
76483,2023-05-30_1J7,185.984068
76919,2023-06-30_1J7,110.608986
77356,2023-01-31_1J7,185.071749
77777,2023-03-31_1J7,109.911330
