In [None]:
# Data processing
# ==============================================================================
import numpy as np
import pandas as pd
from feature_engineering import codify_date_2, remove_outliers, covid_19_2
from feature_engineering import add_weather, one_hot_encode, cyclic_transform
from utils import handle_missing_values
from pathlib import Path

# Plots
# ==============================================================================
import matplotlib.pyplot as plt

# Modelling and Forecasting
# ==============================================================================
from skforecast.recursive import ForecasterRecursiveMultiSeries
from skforecast.preprocessing import RollingFeatures
from skforecast.preprocessing import series_long_to_dict
from skforecast.preprocessing import exog_long_to_dict
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
data = codify_date_2(data)
data = remove_outliers(data)
data = covid_19_2(data)
data = add_weather(data)
data = handle_missing_values(data, "linear")
data = data.drop(columns=["counter_id", "site_id", "site_name", 
                                "counter_installation_date", 
                              "coordinates", "counter_technical_id",
                              "latitude", "longitude", "date", "bike_count"])
data.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Columns with missing values and their counts:
rr1         1326
ht_neige    7232
raf10       1650
etat_sol    8022
dtype: int64


Unnamed: 0,StringencyIndex_Average,counter_name,log_bike_count,datetime,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,46.76,Face au 8 avenue de la porte de Charenton NO-SE,1.098612,2020-09-01 01:00:00,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,46.76,Voie Georges Pompidou NE-SO,0.0,2020-09-01 01:00:00,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,46.76,20 Avenue de Clichy NO-SE,2.079442,2020-09-01 01:00:00,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,46.76,Pont des Invalides S-N,0.0,2020-09-01 01:00:00,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,46.76,39 quai François Mauriac NO-SE,1.098612,2020-09-01 01:00:00,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [3]:
X_test = pd.read_parquet(Path("data") / "final_test.parquet")
X_test = codify_date_2(X_test)
X_test = covid_19_2(X_test)
X_test = add_weather(X_test)
X_test = handle_missing_values(X_test, "linear")

X_test.head()

  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Columns with missing values and their counts:
rr1         165
ht_neige    660
etat_sol    498
dtype: int64


Unnamed: 0,date,StringencyIndex_Average,counter_id,counter_name,site_id,site_name,counter_installation_date,coordinates,counter_technical_id,latitude,...,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,2021-09-10,43.77,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,...,False,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
1,2021-09-10,43.77,100036718-103036718,39 quai François Mauriac SE-NO,100036718,39 quai François Mauriac,2017-07-12,"48.83436,2.377",Y2H17021629,48.83436,...,False,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
2,2021-09-10,43.77,100057380-104057380,Totem Cours la Reine E-O,100057380,Totem Cours la Reine,2020-02-11,"48.86462,2.31444",YTH19111509,48.86462,...,False,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
3,2021-09-10,43.77,100056223-SC,Pont des Invalides N-S,100056223,Pont des Invalides N-S,2019-11-07,"48.86284,2.310345",Y2H19070365,48.86284,...,False,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
4,2021-09-10,43.77,100056226-104056226,Face au 8 avenue de la porte de Charenton NO-SE,100056226,Face au 8 avenue de la porte de Charenton,2019-11-01,"48.830331,2.400551",Y2H19070370,48.830331,...,False,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0


In [4]:
series_train = data[["counter_name", "datetime", "log_bike_count"]]
exog_train = data[["counter_name", "datetime", 't', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend', "StringencyIndex_Average", "year", "month", "hour", "day_of_week", "IsHoliday"]]

categorical_columns = ["year", "month", "day", "hour", "day_of_week", "IsHoliday"]

# Apply transformations
exog_train["tend"] = exog_train["tend"].astype(float)
exog_train["u"] = exog_train["u"].astype(float)
exog_train["ww"] = exog_train["ww"].astype(float)
exog_train["year"] = exog_train["year"].astype(float)
exog_train["IsHoliday"] = exog_train["IsHoliday"].astype(float)
exog_train = cyclic_transform(exog_train, "hour", 24)
exog_train = one_hot_encode(exog_train, ["month", "day_of_week"])
exog_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exog_train["tend"] = exog_train["tend"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exog_train["u"] = exog_train["u"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exog_train["ww"] = exog_train["ww"].astype(float)
A value is trying to be set on a copy of a slice from

Unnamed: 0,counter_name,datetime,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,...,month_10,month_11,month_12,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,Face au 8 avenue de la porte de Charenton NO-SE,2020-09-01 01:00:00,285.75,0.0,81.0,0.0,2.4,1.6,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Voie Georges Pompidou NE-SO,2020-09-01 01:00:00,285.75,0.0,81.0,0.0,2.4,1.6,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,20 Avenue de Clichy NO-SE,2020-09-01 01:00:00,285.75,0.0,81.0,0.0,2.4,1.6,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Pont des Invalides S-N,2020-09-01 01:00:00,285.75,0.0,81.0,0.0,2.4,1.6,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,39 quai François Mauriac NO-SE,2020-09-01 01:00:00,285.75,0.0,81.0,0.0,2.4,1.6,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [33]:
series_dict_train = series_long_to_dict(
    data      = series_train,
    series_id = 'counter_name',
    index     = 'datetime',
    values    = 'log_bike_count',
    freq      = 'H'
)

exog_dict_train = exog_long_to_dict(
    data      = exog_train,
    series_id = 'counter_name',
    index     = 'datetime',
    freq      = 'H'
)

  original_sizes = data.groupby(series_id).size()
  for k, v in data.groupby(series_id):
  series_dict[k] = v.set_index(index)[values].asfreq(freq).rename(k)
  original_sizes = data.groupby(series_id).size()
  exog_dict = dict(tuple(data.groupby(series_id)))
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.se

In [None]:
exog_test = X_test[["counter_name", "datetime", 't', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend', "StringencyIndex_Average", "year", "month", "hour", "day_of_week", "IsHoliday"]]


# Apply sine and cosine transformations
exog_test["tend"] = exog_test["tend"].astype(float)
exog_test["u"] = exog_test["u"].astype(float)
exog_test["ww"] = exog_test["ww"].astype(float)
exog_test["year"] = exog_test["year"].astype(float)
exog_test["IsHoliday"] = exog_test["IsHoliday"].astype(float)
exog_test = cyclic_transform(exog_test, "hour", 24)
exog_test = one_hot_encode(exog_test, ["month", "day_of_week"])
exog_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exog_test["tend"] = exog_test["tend"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exog_test["u"] = exog_test["u"].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exog_test["ww"] = exog_test["ww"].astype(float)
A value is trying to be set on a copy of a slice from a Dat

Unnamed: 0,counter_name,datetime,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,...,hour_cos,month_9,month_10,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
0,28 boulevard Diderot E-O,2021-09-10 01:00:00,291.45,0.0,94.0,0.0,3.4,2.0,0.0,1.0,...,0.965926,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,39 quai François Mauriac SE-NO,2021-09-10 01:00:00,291.45,0.0,94.0,0.0,3.4,2.0,0.0,1.0,...,0.965926,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Totem Cours la Reine E-O,2021-09-10 01:00:00,291.45,0.0,94.0,0.0,3.4,2.0,0.0,1.0,...,0.965926,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Pont des Invalides N-S,2021-09-10 01:00:00,291.45,0.0,94.0,0.0,3.4,2.0,0.0,1.0,...,0.965926,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Face au 8 avenue de la porte de Charenton NO-SE,2021-09-10 01:00:00,291.45,0.0,94.0,0.0,3.4,2.0,0.0,1.0,...,0.965926,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [35]:
exog_dict_test = exog_long_to_dict(
    data      = exog_test,
    series_id = 'counter_name',
    index     = 'datetime',
    freq      = 'H'
)

  original_sizes = data.groupby(series_id).size()
  exog_dict = dict(tuple(data.groupby(series_id)))
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(columns=series_id)
  k: v.set_index(index).asfreq(freq).drop(col

In [36]:
# Fit forecaster
# ==============================================================================
window_features = RollingFeatures(stats=['mean', 'mean'], window_sizes=[24, 168])
forecaster = ForecasterRecursiveMultiSeries(
                regressor          = HistGradientBoostingRegressor(random_state=123),
                lags               = [1, 24, 168],
                window_features    = window_features,
                encoding           = "ordinal",
                dropna_from_series = False
            )

forecaster.fit(series=series_dict_train, exog=exog_dict_train, suppress_warnings=True)
forecaster

In [37]:
predictions = forecaster.predict(steps=1020, exog=exog_dict_test)

    `last_window` ends at : 2021-09-09 23:00:00.
    `exog` for series '152 boulevard du Montparnasse E-O' starts at : 2021-09-10 01:00:00.
     Expected index       : 2021-09-10 00:00:00. 
    `last_window` ends at : 2021-09-09 23:00:00.
    `exog` for series '152 boulevard du Montparnasse O-E' starts at : 2021-09-10 01:00:00.
     Expected index       : 2021-09-10 00:00:00. 
    `last_window` ends at : 2021-09-09 23:00:00.
    `exog` for series '18 quai de l'Hôtel de Ville NO-SE' starts at : 2021-09-10 01:00:00.
     Expected index       : 2021-09-10 00:00:00. 
    `last_window` ends at : 2021-09-09 23:00:00.
    `exog` for series '18 quai de l'Hôtel de Ville SE-NO' starts at : 2021-09-10 01:00:00.
     Expected index       : 2021-09-10 00:00:00. 
    `last_window` ends at : 2021-09-09 23:00:00.
    `exog` for series '20 Avenue de Clichy NO-SE' starts at : 2021-09-10 01:00:00.
     Expected index       : 2021-09-10 00:00:00. 
    `last_window` ends at : 2021-09-09 23:00:00.
    `exog

In [38]:
predictions = predictions.reset_index(names='datetime').melt(id_vars=['datetime'], var_name='counter_name', value_name='log_bike_count')
predictions.head()

Unnamed: 0,datetime,counter_name,log_bike_count
0,2021-09-10 00:00:00,152 boulevard du Montparnasse E-O,1.921563
1,2021-09-10 01:00:00,152 boulevard du Montparnasse E-O,1.236673
2,2021-09-10 02:00:00,152 boulevard du Montparnasse E-O,0.946127
3,2021-09-10 03:00:00,152 boulevard du Montparnasse E-O,1.133086
4,2021-09-10 04:00:00,152 boulevard du Montparnasse E-O,2.581472


In [41]:
X_original = pd.read_parquet(Path("data") / "final_test.parquet")
X_original.loc[:, "index1"] = X_original.index
X_original = codify_date_2(X_original)
X_original.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,index1,datetime,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0,2021-09-10 01:00:00,2021,9,10,4,1,False,False
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1,2021-09-10 13:00:00,2021,9,10,4,13,False,False
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2,2021-09-10 17:00:00,2021,9,10,4,17,False,False
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,3,2021-09-10 19:00:00,2021,9,10,4,19,False,False
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,4,2021-09-10 22:00:00,2021,9,10,4,22,False,False


In [42]:
merged_df = X_original.merge(predictions, on=['counter_name', 'datetime'], how='left')
merged_df.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,index1,datetime,year,month,day,day_of_week,hour,is_weekend,IsHoliday,log_bike_count
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0,2021-09-10 01:00:00,2021,9,10,4,1,False,False,0.190413
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1,2021-09-10 13:00:00,2021,9,10,4,13,False,False,1.82354
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2,2021-09-10 17:00:00,2021,9,10,4,17,False,False,1.847571
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,3,2021-09-10 19:00:00,2021,9,10,4,19,False,False,0.830286
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,4,2021-09-10 22:00:00,2021,9,10,4,22,False,False,0.395662


In [44]:
results = pd.DataFrame(
    dict(
        Id=np.arange(len(merged_df["log_bike_count"])),
        log_bike_count=merged_df["log_bike_count"],
    )
)
results.to_csv("submission.csv", index=False)

In [5]:
import session_info
session_info.show(html=False)

-----
feature_engineering NA
matplotlib          3.9.2
numpy               2.0.2
pandas              2.2.3
session_info        1.0.0
skforecast          0.14.0
sklearn             1.5.2
utils               NA
-----
IPython             8.30.0
jupyter_client      8.6.3
jupyter_core        5.7.2
jupyterlab          4.3.3
notebook            7.3.1
-----
Python 3.12.4 | packaged by conda-forge | (main, Jun 17 2024, 10:13:44) [Clang 16.0.6 ]
macOS-15.0.1-arm64-arm-64bit
-----
Session information updated at 2024-12-11 20:37


In [None]:
pip install matplotlib==3.9.2
pip install numpy==2.0.2
pip install pandas==2.2.3
pip install scikit-learn==1.5.2
pip install skforecast==0.14.0

SyntaxError: invalid syntax (209920656.py, line 1)