In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import holidays
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from prophet import Prophet
from statsmodels.tsa.statespace.sarimax import SARIMAX
from feature_engineering import codify_date, codify_date_2, remove_outliers, get_X_y, covid_19, covid_19_2, add_weather


In [2]:
df = pd.read_parquet(Path("data") / "train.parquet")
test = pd.read_parquet(Path("data") / "final_test.parquet")

test_old = test.copy()


In [3]:
test_old.columns

Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude'],
      dtype='object')

In [4]:
df = codify_date_2(df)
test = codify_date_2(test)

In [5]:
features = ['hour', 'month', 'IsHoliday', 'day', "day_of_week", "is_weekend"]
target = 'log_bike_count'

X = df[features]
y = df[target]

model = HistGradientBoostingRegressor(max_iter=100, random_state=42)

cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

print("Negative MSE scores from cross-validation:", cv_scores)
print("Average Negative MSE:", cv_scores.mean())

Negative MSE scores from cross-validation: [-0.96288667 -0.55461495 -0.64306749 -0.95816682 -1.43161343]
Average Negative MSE: -0.9100698711407886


In [6]:
model = HistGradientBoostingRegressor(max_iter=100, random_state=42)

model.fit(X, y)

X_test = test[features]

y_pred_test = model.predict(X_test)

output_df = pd.DataFrame({
    'Id': test.index,
    'log_bike_count': y_pred_test
})


print(output_df.head())



   Id  log_bike_count
0   0        1.471516
1   1        4.456159
2   2        5.158623
3   3        4.210475
4   4        3.285660


# Now including weather

In [7]:
df = add_weather(df)
test = add_weather(test)

In [8]:
test

Unnamed: 0_level_0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
original_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,False,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,False,294.05,1.6,89,0.0,3.7,1.8,25,0.0,-30
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,False,296.35,0.0,69,0.0,7.6,5.1,1,0.0,-10
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,False,293.65,0.8,81,0.0,5.8,2.7,81,1.0,130
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,False,291.25,0.0,86,0.0,4.0,1.8,1,1.0,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51435,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,False,284.35,0.0,86,0.0,3.9,2.4,3,0.0,70
51436,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,False,291.55,0.0,73,0.0,7.7,5.2,3,0.0,-40
51437,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,False,291.55,0.0,73,0.0,7.7,5.2,3,0.0,-40
51438,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,2021-10-18,2020-11-29,"48.83977,2.30198",Y2H20114504,48.839770,2.301980,...,False,290.05,-0.1,83,0.0,2.7,1.9,61,0.0,40


## Ab hier model

In [9]:
weather_columns = ['t', 'rr1', 'ff', 'raf10', 'u', 'ww', 'etat_sol', 'tend']

features = ['hour', 'month', 'IsHoliday', 'day', "day_of_week", "is_weekend"] + weather_columns
target = 'log_bike_count'

X = df[features]
y = df[target]

model = HistGradientBoostingRegressor(max_iter=100, random_state=42)

cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

print("Negative MSE scores from cross-validation:", cv_scores)
print("Average Negative MSE:", cv_scores.mean())


Negative MSE scores from cross-validation: [-0.95660446 -0.55033323 -0.63968072 -0.952396   -1.42777054]
Average Negative MSE: -0.9053569912285777


In [10]:
model = HistGradientBoostingRegressor(max_iter=100, random_state=42)

model.fit(X, y)

X_test = test[features]

y_pred_test = model.predict(X_test)

output_df = pd.DataFrame({
    'Id': test.index,
    'log_bike_count': y_pred_test
})


print(output_df.head())



   Id  log_bike_count
0   0        1.497950
1   1        3.995327
2   2        4.981202
3   3        3.821747
4   4        3.263629


In [11]:
output_df.to_csv("submission_maxim.csv", index=False)

In [12]:
output_df.shape

(51440, 2)