# Model Testing

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from feature_engineering import codify_date, codify_date_2, remove_outliers, get_X_y, covid_19, covid_19_2
from feature_engineering import add_weather, add_lag_and_rolling_features
from utils import handle_missing_values
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
data_test = pd.read_parquet(Path("data") / "final_test.parquet")

### Simplest Model
Variables: Date (hour, weekdaay, daymonth, IsHoliday), Counter Name

Outliers: Included

In [3]:
mdata = codify_date(data)
X, y = get_X_y(mdata)
X.head()

Unnamed: 0,counter_name,hour,weekday,daymonth,IsHoliday
48321,28 boulevard Diderot E-O,2,1,01_9,False
48324,28 boulevard Diderot E-O,3,1,01_9,False
48327,28 boulevard Diderot E-O,4,1,01_9,False
48330,28 boulevard Diderot E-O,15,1,01_9,False
48333,28 boulevard Diderot E-O,18,1,01_9,False


In [33]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "Lasso": linear_model.Lasso(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.95475 +- 0.163
XG Boost - The accuracy is: -0.96059 +- 0.162
Lasso - The accuracy is: -1.67937 +- 0.135
LightGBM - The accuracy is: -0.95777 +- 0.162


### Simplest Model (different Date Codification)


In [5]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
X, y = get_X_y(mdata)
X.head()

Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
48321,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
48324,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
48327,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
48330,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
48333,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.92597 +- 0.175
XG Boost - The accuracy is: -0.91152 +- 0.176
LightGBM - The accuracy is: -0.92558 +- 0.175


### Without Outliers

In [3]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.86118 +- 0.120
XG Boost - The accuracy is: -0.83952 +- 0.122
LightGBM - The accuracy is: -0.85560 +- 0.121


### Sine Cossine Encoding

In [8]:
from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

In [24]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [30]:
categorical_columns = ["counter_name", "year", "day", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("hour_sin", sin_transformer(24), ["hour"]),
        ("hour_cos", cos_transformer(24), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.83913 +- 0.122
XG Boost - The accuracy is: -0.82384 +- 0.130
LightGBM - The accuracy is: -0.83852 +- 0.122


### Sine And Cosine Features without day

In [31]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata).drop(columns="day")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,18,False,False


In [32]:
categorical_columns = ["counter_name", "year", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("hour_sin", sin_transformer(24), ["hour"]),
        ("hour_cos", cos_transformer(24), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85674 +- 0.121
XG Boost - The accuracy is: -0.85017 +- 0.122
LightGBM - The accuracy is: -0.85558 +- 0.123


### Periodic Spline Features

In [13]:
from sklearn.preprocessing import SplineTransformer


def periodic_spline_transformer(period, n_splines=None, degree=3):
    if n_splines is None:
        n_splines = period
    n_knots = n_splines + 1  # periodic and include_bias is True
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True,
    )

In [28]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [29]:
categorical_columns = ["counter_name", "year", "day", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("cyclic_hour", periodic_spline_transformer(24, n_splines=12), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.84487 +- 0.124
XG Boost - The accuracy is: -0.82768 +- 0.124
LightGBM - The accuracy is: -0.84149 +- 0.121


### Without Outliers, With 1/0 Covid-19 Column

In [3]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,Covid-19
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False,0
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False,0
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False,0
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False,0
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False,0


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85981 +- 0.119
XG Boost - The accuracy is: -0.83691 +- 0.124
LightGBM - The accuracy is: -0.85730 +- 0.122


### Without Outliers, With Index for Covid-19

In [4]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19_2(mdata)
mdata.isnull().sum()
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Unnamed: 0,StringencyIndex_Average,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
1,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
2,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
3,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
4,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False


In [None]:

numerical_columns = ["StringencyIndex_Average"]
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -1.56809 +- 0.090
XG Boost - The accuracy is: -1.57179 +- 0.088
LightGBM - The accuracy is: -1.57015 +- 0.092


### With Weather Data

Outliers: Removed

Missing Values: Linear Interpolation

In [3]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         3984
ht_neige    5232
raf10       2640
etat_sol    5232
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 48 quai de la marne NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,6 rue Julia Bartet SO-NE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,18 quai de l'Hôtel de Ville SE-NO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Face au 4 avenue de la porte de Bagnolet O-E,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [4]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.76020 +- 0.100
XG Boost - The accuracy is: -0.74025 +- 0.106
LightGBM - The accuracy is: -0.75816 +- 0.096


Outliers: Included

Missing Values: Linear Interpolation

In [6]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

Columns with missing values and their counts:
rr1         4032
ht_neige    5376
raf10       2688
etat_sol    5376
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,27 quai de la Tournelle SE-NO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,90 Rue De Sèvres SO-NE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,152 boulevard du Montparnasse O-E,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Face au 25 quai de l'Oise NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,Pont de la Concorde S-N,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [7]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.82307 +- 0.095
XG Boost - The accuracy is: -0.80880 +- 0.093
LightGBM - The accuracy is: -0.81815 +- 0.095


Outliers: Removed

Missing Values: Spline Interpolation

In [8]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "spline")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         3984
ht_neige    5232
raf10       2640
etat_sol    5232
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 48 quai de la marne NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,6 rue Julia Bartet SO-NE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,18 quai de l'Hôtel de Ville SE-NO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Face au 4 avenue de la porte de Bagnolet O-E,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [None]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.75595 +- 0.107
XG Boost - The accuracy is: -0.74025 +- 0.106
LightGBM - The accuracy is: -0.75816 +- 0.096


Outliers: Removed

Missing Values: Median

In [43]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,Pont de la Concorde S-N,2020,9,1,1,14,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,Pont de la Concorde S-N,2020,9,1,1,13,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Pont de la Concorde S-N,2020,9,1,1,12,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,Pont de la Concorde S-N,2020,9,1,1,9,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [None]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy='median'), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.71206 +- 0.114
[-0.91307228 -0.82579268 -0.6287237  -0.58306573 -0.80402816 -0.66826839
 -0.69016106 -0.58337952]
XG Boost - The accuracy is: -0.68451 +- 0.138
[-0.93379312 -0.82421118 -0.61511037 -0.52272333 -0.79160148 -0.60049783
 -0.64231291 -0.54582169]
LightGBM - The accuracy is: -0.70910 +- 0.117
[-0.91169942 -0.83134498 -0.62996563 -0.56850314 -0.80232144 -0.64839326
 -0.69684392 -0.58370193]


### With Lags

Check all features, one by one

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = add_lag_and_rolling_features(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  data.groupby('counter_id')


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,...,ww,etat_sol,tend,lag_1,lag_24,lag_168,rolling_mean_24h,rolling_std_24h,rolling_mean_7d,rolling_std_7d
169,28 boulevard Diderot O-E,2020,9,8,1,2,False,False,288.25,0.0,...,1,0.0,50,1.0,2.0,2.0,74.416667,70.794201,65.089286,58.821747
170,28 boulevard Diderot O-E,2020,9,8,1,3,False,False,288.25,0.0,...,1,0.0,50,4.0,2.0,7.0,74.625,70.578857,65.089286,58.821747
171,28 boulevard Diderot O-E,2020,9,8,1,4,False,False,288.25,0.0,...,1,0.0,50,7.0,4.0,10.0,74.875,70.327997,65.089286,58.821747
172,28 boulevard Diderot O-E,2020,9,8,1,5,False,False,288.25,0.0,...,1,0.0,50,10.0,48.0,48.0,75.416667,70.161873,65.166667,58.807681
173,28 boulevard Diderot O-E,2020,9,8,1,6,False,False,288.25,0.0,...,1,0.0,50,61.0,133.0,95.0,76.125,70.850923,65.494048,59.126982


In [9]:
weather_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", 
                       "hour", "is_weekend", "IsHoliday"]
lag_columns = ["lag_1", "lag_24", "lag_168", "rolling_mean_24h", "rolling_std_24h", 
               "rolling_mean_7d", "rolling_std_7d"]

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy='median'), weather_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
}

for i in range(0, len(lag_columns)):
    current_lag_columns = [lag_columns[i]]  # Add one lag column at a time
    print(f"\nEvaluating with lag columns: {current_lag_columns}")
    
    # Update the preprocessor to include the current lag columns
    preprocessor = ColumnTransformer([
        ("num", SimpleImputer(strategy='median'), weather_columns + current_lag_columns),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ])
    
    for regressor_name, regressor in regressors.items():
        # Define the pipeline
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", regressor)
        ])
        
        # Perform cross-validation
        scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                                 n_jobs=-1, scoring="neg_root_mean_squared_error")
        
        # Print results
        print(f"{regressor_name} - RMSE: {abs(scores.mean()):.5f} ± {scores.std():.3f}")


Evaluating with lag columns: ['lag_1']
XG Boost - RMSE: 0.46668 ± 0.040
LightGBM - RMSE: 0.46949 ± 0.035

Evaluating with lag columns: ['lag_24']
XG Boost - RMSE: 0.50796 ± 0.041
LightGBM - RMSE: 0.50793 ± 0.035

Evaluating with lag columns: ['lag_168']
XG Boost - RMSE: 0.50734 ± 0.056
LightGBM - RMSE: 0.50976 ± 0.050

Evaluating with lag columns: ['rolling_mean_24h']
XG Boost - RMSE: 0.52590 ± 0.037
LightGBM - RMSE: 0.55538 ± 0.031

Evaluating with lag columns: ['rolling_std_24h']
XG Boost - RMSE: 0.59439 ± 0.047
LightGBM - RMSE: 0.60560 ± 0.039

Evaluating with lag columns: ['rolling_mean_7d']
XG Boost - RMSE: 0.53918 ± 0.042
LightGBM - RMSE: 0.57278 ± 0.034

Evaluating with lag columns: ['rolling_std_7d']
XG Boost - RMSE: 0.57369 ± 0.055
LightGBM - RMSE: 0.60359 ± 0.047


# Submission

In [16]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
#mdata = add_weather(mdata)
#mdata = handle_missing_values(mdata, "spline")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [26]:
X_test = pd.read_parquet(Path("data") / "final_test.parquet")
X_test = codify_date_2(X_test)
#X_test = add_weather(X_test)
#X_test = handle_missing_values(X_test, "spline")
X_test = X_test.drop(columns=["counter_id", "site_id", "site_name", 
                                "counter_installation_date", 
                              "coordinates", "counter_technical_id",
                              "latitude", "longitude", "datetime", "date"])
X_test.head()

Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2021,9,10,4,1,False,False
1,28 boulevard Diderot E-O,2021,9,10,4,13,False,False
2,28 boulevard Diderot E-O,2021,9,10,4,17,False,False
3,28 boulevard Diderot E-O,2021,9,10,4,19,False,False
4,28 boulevard Diderot E-O,2021,9,10,4,22,False,False


In [27]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
#categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]
categorical_columns = ["counter_name", "year", "day", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("hour_sin", sin_transformer(24), ["hour"]),
        ("hour_cos", cos_transformer(24), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor())
])

# Fit the pipeline to the training data
pipeline.fit(X, y)

y_pred = pipeline.predict(X_test)
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission.csv", index=False)

In [27]:
data["date"].min()

Timestamp('2020-09-01 00:00:00')

In [28]:
data["date"].max()

Timestamp('2021-09-09 00:00:00')