# Mean estimator

Let $s^*$ be a given station and $d^*$ a given data. Let $\text{job}(d)$ be a boolean being true if $d \in \{ \text{Lundi, Mardi, Mercredi, Jeudi, Vendredi} \}$ and false if $d \in \{ \text{Samedi, Dimanche} \}$. We estimate the number of validation $v_{s^*,d^*}$ by $\hat{v}_{s^*,d^*} = \frac{1}{\sum_{d \in D} \textbf{1}_{\text{job}(d) = \text{job}(d^*)}} \sum_{d \in D} \textbf{1}_{\text{job}(d) = \text{job}(d^*)} v_{s, d}$, where $D$ is the set of available dates for the station $s^*$.

## Imports

In [66]:
# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
from tqdm import tqdm

In [67]:
# our code
from utils import load_train, mape

## Loading the data

In [101]:
x_train_data, y_train_data = load_train('data/x_train.csv', 'data/y_train.csv')

In [102]:
x_train_data

Unnamed: 0,date,station,job,ferie,vacances
0,2015-01-01,1J7,1,1,1
1,2015-01-01,O2O,1,1,1
2,2015-01-01,8QR,1,1,1
3,2015-01-01,UMC,1,1,1
4,2015-01-01,FK3,1,1,1
...,...,...,...,...,...
1229858,2022-12-31,V2P,0,0,1
1229859,2022-12-31,N9K,0,0,1
1229860,2022-12-31,P6E,0,0,1
1229861,2022-12-31,BDC,0,0,1


In [109]:
y_train_data

Unnamed: 0,index,y
0,2015-01-01_1J7,7
1,2015-01-01_O2O,0
2,2015-01-01_8QR,9
3,2015-01-01_UMC,9
4,2015-01-01_FK3,28
...,...,...
1229858,2022-12-31_V2P,1227
1229859,2022-12-31_N9K,544
1229860,2022-12-31_P6E,92
1229861,2022-12-31_BDC,91


In [94]:
x_train_data.drop('date', axis=1, inplace=True)
x_train_data.drop('ferie', axis=1, inplace=True)
x_train_data.drop('vacances', axis=1, inplace=True)
#x_train_data = x_train_data.values
x_train_data

array([['1J7', 1],
       ['O2O', 1],
       ['8QR', 1],
       ...,
       ['P6E', 0],
       ['BDC', 0],
       ['W14', 0]], dtype=object)

In [95]:
y_train_data.drop('index', axis=1, inplace=True)
y_train_data = y_train_data.values.flatten()
y_train_data

array([ 7,  0,  9, ..., 92, 91, 18])

## Mean function

In [116]:
def compute_mean(station, job):
    sum = 0
    nb = 0
    for i, row in x_train_data.iterrows():
        # i contains the index of the current row
        # row contains the data of the current row as a pandas Series
        if (row['station'] == station) and (row['job'] == job):
            sum += y_train_data['y'].values[i]
            nb += 1
    return sum/nb

## Precomputations
We precompute the mean of validations for each station for each type of day (job or not job)

In [117]:
class Means:
    def __init__(self):
        self.data = {}

    def add_entry(self, key, value):
        if key not in self.data:
            self.data[key] = [None, None]
        self.data[key][0] = value[0]
        self.data[key][1] = value[1]

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, value):
        self.add_entry(key, value)

In [118]:
means = Means()
for station in tqdm(x_train_data['station'].unique()):
    means[station] = (compute_mean(station, 0), compute_mean(station, 1))
means

  0%|          | 0/439 [00:00<?, ?it/s]

  0%|          | 0/439 [00:37<?, ?it/s]


KeyboardInterrupt: 

In [76]:
def get_mean(line_station, line_job):
    # returns the mean of validation for the station line['station'] over the dates that verify line['job']
    select_station = (x_train_data['station'] == line_station)
    select_job = (x_train_data['job'] == line_job)
    select = np.logical_and(select_station, select_job)

    # not possible to do: selected_lines = y_train_data.iloc[select]
    #selected_lines = pd.DataFrame()
    sum = 0
    nb = 0
    for i in range(y_train_data.size):
        if select[i]:
            """
            print("line['job']:",line['job'])
            print("line['station']:",line['station'])
            print("x_train_data.iloc[i]['job']:",x_train_data.iloc[i]['job'])
            print("x_train_data.iloc[i]['station']:",x_train_data.iloc[i]['station'])
            selected_lines.add(y_train_data.iloc[i])
            """
            nb += 1
            sum += y_train_data['y'].values[i]
    return sum/nb

## Trying it on a validation set

In [77]:
# Split data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(x_train_data, y_train_data, test_size=0.01, random_state=42)

In [82]:
y_pred = np.zeros_like(y_val)
print("Number of iteration:", len(X_val))
for i, x in tqdm(enumerate(X_val)):
    y_pred[i] = compute_mean(x[0], x[1])

Number of iteration: 12299


66it [00:14,  4.40it/s]


KeyboardInterrupt: 

In [83]:
# Predict on the testing set
y_pred

array([  54, 3789,  381, ...,    0,    0,    0])

In [84]:

y_val

array([  92, 3580,  354, ...,   85, 9061,  461])

## Evaluating the prediction on a validation set

In [None]:
# Calculate Mean Squared Error
print('Validation MAPE:', mape(y_val, y_pred))

Validation MAPE: 0.1490748127951265


## Prediction on the test set

In [24]:
# Load and reformat the testing input x_test from CSV file
x_test = pd.read_csv('data/x_test.csv')
x_test.drop(columns=['index'], inplace=True)
x_test['date'] = pd.to_datetime(x_test['date'])
# NE PAS FAIRE reference_date = x_test['date'][0]
print("reference_date: ", reference_date)
x_test['date'] = (x_test['date'] - reference_date).dt.days
x_test['station'], unique_ids = pd.factorize(x_test['station'])
x_test

reference_date:  2015-01-01 00:00:00


Unnamed: 0,date,station,job,ferie,vacances
0,2922,0,0,1,1
1,2922,1,0,1,1
2,2922,2,0,1,1
3,2922,3,0,1,1
4,2922,4,0,1,1
...,...,...,...,...,...
78647,3072,428,1,0,0
78648,3072,429,1,0,0
78649,3072,430,1,0,0
78650,3072,438,1,0,0


In [25]:
# Initialize linear regression model
model = LinearRegression()

# Train the model on the entire x and y training data
model.fit(x_train_data, y_train_data)

# Prediction and test values
y_test_pred = model.predict(x_test)
y_test_pred

array([[-4824.96319729],
       [-4805.9670311 ],
       [-4786.97086491],
       ...,
       [ 8880.37354046],
       [ 9032.34286996],
       [ 8899.36970665]])

## Reformating for submission

In [26]:
y_submit = pd.read_csv('data/submission_example.csv')
y_submit

Unnamed: 0,index,y
0,2023-01-01_1J7,10000
1,2023-01-01_O2O,10000
2,2023-01-01_8QR,10000
3,2023-01-01_L58,10000
4,2023-01-01_UMC,10000
...,...,...
78647,2023-05-31_N9K,10000
78648,2023-05-31_P6E,10000
78649,2023-05-31_BDC,10000
78650,2023-05-31_QD6,10000


In [27]:
y_submit['y'] = y_test_pred
y_submit

Unnamed: 0,index,y
0,2023-01-01_1J7,-4824.963197
1,2023-01-01_O2O,-4805.967031
2,2023-01-01_8QR,-4786.970865
3,2023-01-01_L58,-4767.974699
4,2023-01-01_UMC,-4748.978533
...,...,...
78647,2023-05-31_N9K,8842.381208
78648,2023-05-31_P6E,8861.377374
78649,2023-05-31_BDC,8880.373540
78650,2023-05-31_QD6,9032.342870


In [None]:
filename = 'submissions/new_submission.csv'
y_submit.to_csv(filename, index=False)