# First try: linear regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

## Metric

In [2]:
def mape(actual, forecast, ignore=False):
    """
    Compute the Mean Absolute Percentage Error (MAPE)
    
    Parameters:
    actual (numpy array): Array of actual values
    forecast (numpy array): Array of forecasted values
    
    Returns:
    float: MAPE value
    """
    if ignore:
        nonzero_actual = (actual != 0)
        nonzero_forecast = (forecast != 0)
        nonzero = np.logical_and(nonzero_actual, nonzero_forecast)
        actual = actual[nonzero] # delete elements of actual for which either actual=0 or forecast=0
        forecast = forecast[nonzero] # delete elements of forecasr for which either actual=0 or forecast=0
        # hence actual and forecast are both the same size
    else:
        eps = 1
        zero_actual = (actual == 0)
        zero_forecast = (forecast == 0)
        actual[zero_actual] = actual[zero_actual] + eps
        forecast[zero_forecast] = forecast[zero_forecast] + eps
    return np.mean(np.abs(actual - forecast)/actual)

## Loading the data

In [3]:
# Load the training input x_train from CSV file
x_train_data = pd.read_csv('data/x_train.csv')
x_train_data

Unnamed: 0,date,station,job,ferie,vacances
0,2015-01-01,1J7,1,1,1
1,2015-01-01,O2O,1,1,1
2,2015-01-01,8QR,1,1,1
3,2015-01-01,UMC,1,1,1
4,2015-01-01,FK3,1,1,1
...,...,...,...,...,...
1229858,2022-12-31,V2P,0,0,1
1229859,2022-12-31,N9K,0,0,1
1229860,2022-12-31,P6E,0,0,1
1229861,2022-12-31,BDC,0,0,1


In [4]:
# Convert the dates to datetime objects
x_train_data['date'] = pd.to_datetime(x_train_data['date'])

# Define a reference date
reference_date = x_train_data['date'][0]

# Calculate the number of days since the reference date for each date
x_train_data['date'] = (x_train_data['date'] - reference_date).dt.days

x_train_data

Unnamed: 0,date,station,job,ferie,vacances
0,0,1J7,1,1,1
1,0,O2O,1,1,1
2,0,8QR,1,1,1
3,0,UMC,1,1,1
4,0,FK3,1,1,1
...,...,...,...,...,...
1229858,2921,V2P,0,0,1
1229859,2921,N9K,0,0,1
1229860,2921,P6E,0,0,1
1229861,2921,BDC,0,0,1


In [5]:
# Exemple
print(x_train_data['station'][0])
print(x_train_data['station'][423])

print(x_train_data['station'][62])
print(x_train_data['station'][487])

1J7
1J7
BZ0
BZ0


In [6]:
x_train_data['station'], unique_ids = pd.factorize(x_train_data['station'])
x_train_data

Unnamed: 0,date,station,job,ferie,vacances
0,0,0,1,1,1
1,0,1,1,1,1
2,0,2,1,1,1
3,0,3,1,1,1
4,0,4,1,1,1
...,...,...,...,...,...
1229858,2921,431,0,0,1
1229859,2921,432,0,0,1
1229860,2921,434,0,0,1
1229861,2921,435,0,0,1


In [7]:
# Exemple
print(x_train_data['station'][0])
print(x_train_data['station'][423])

print(x_train_data['station'][62])
print(x_train_data['station'][487])

0
0
62
62


In [8]:
# Load the training output y_train from CSV file
y_train_data = pd.read_csv('data/y_train.csv')
print(y_train_data)

                  index     y
0        2015-01-01_1J7     7
1        2015-01-01_O2O     0
2        2015-01-01_8QR     9
3        2015-01-01_UMC     9
4        2015-01-01_FK3    28
...                 ...   ...
1229858  2022-12-31_V2P  1227
1229859  2022-12-31_N9K   544
1229860  2022-12-31_P6E    92
1229861  2022-12-31_BDC    91
1229862  2022-12-31_W14    18

[1229863 rows x 2 columns]


In [9]:
# Delete the index column
y_train_data.drop('index', axis=1, inplace=True)
y_train_data

Unnamed: 0,y
0,7
1,0
2,9
3,9
4,28
...,...
1229858,1227
1229859,544
1229860,92
1229861,91


## Training the linear regression

In [10]:
# Split data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(x_train_data, y_train_data, test_size=0.1, random_state=42)

# Initialize linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [11]:
# Predict on the testing set
y_pred = model.predict(X_val).flatten()
y_pred

array([1016.20658415, 4405.97066345, 2209.07823988, ..., 7175.50632291,
       7952.94994136, 1096.81889914])

In [12]:
y_val = y_val.values.flatten()
y_val

array([   92,  3580,   354, ...,  1179, 24940,    72])

## Evaluating the prediction on a validation set

In [13]:
# Calculate Mean Squared Error
print('Validation MAPE:', mape(y_val, y_pred))

Validation MAPE: 42.03480298235478


## Prediction on the test set

In [14]:
# Load and reformat the testing input x_test from CSV file
x_test = pd.read_csv('data/x_test.csv')
x_test.drop(columns=['index'], inplace=True)
x_test['date'] = pd.to_datetime(x_test['date'])
# NE PAS FAIRE reference_date = x_test['date'][0]
print("reference_date: ", reference_date)
x_test['date'] = (x_test['date'] - reference_date).dt.days
x_test['station'], unique_ids = pd.factorize(x_test['station'])
x_test

reference_date:  2015-01-01 00:00:00


Unnamed: 0,date,station,job,ferie,vacances
0,2922,0,0,1,1
1,2922,1,0,1,1
2,2922,2,0,1,1
3,2922,3,0,1,1
4,2922,4,0,1,1
...,...,...,...,...,...
78647,3072,428,1,0,0
78648,3072,429,1,0,0
78649,3072,430,1,0,0
78650,3072,438,1,0,0


In [15]:
# Initialize linear regression model
model = LinearRegression()

# Train the model on the entire x and y training data
model.fit(x_train_data, y_train_data)

# Prediction and test values
y_test_pred = model.predict(x_test)
y_test_pred

array([[-4824.96319729],
       [-4805.9670311 ],
       [-4786.97086491],
       ...,
       [ 8880.37354046],
       [ 9032.34286996],
       [ 8899.36970665]])

## Reformating for submission

In [16]:
y_submit = pd.read_csv('data/submission_example.csv')
y_submit

Unnamed: 0,index,y
0,2023-01-01_1J7,10000
1,2023-01-01_O2O,10000
2,2023-01-01_8QR,10000
3,2023-01-01_L58,10000
4,2023-01-01_UMC,10000
...,...,...
78647,2023-05-31_N9K,10000
78648,2023-05-31_P6E,10000
78649,2023-05-31_BDC,10000
78650,2023-05-31_QD6,10000


In [17]:
y_submit['y'] = y_test_pred
y_submit

Unnamed: 0,index,y
0,2023-01-01_1J7,-4824.963197
1,2023-01-01_O2O,-4805.967031
2,2023-01-01_8QR,-4786.970865
3,2023-01-01_L58,-4767.974699
4,2023-01-01_UMC,-4748.978533
...,...,...
78647,2023-05-31_N9K,8842.381208
78648,2023-05-31_P6E,8861.377374
78649,2023-05-31_BDC,8880.373540
78650,2023-05-31_QD6,9032.342870


In [18]:
filename = 'submissions/new_submission.csv'
y_submit.to_csv(filename, index=False)