# Model Testing

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
import utils
from feature_engineering import codify_date, codify_date_2, remove_outliers, get_X_y, covid_19, covid_19_2
from feature_engineering import add_weather, add_lag_and_rolling_features
from utils import handle_missing_values
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
data_test = pd.read_parquet(Path("data") / "final_test.parquet")

### Simplest Model
Variables: Date (hour, weekdaay, daymonth, IsHoliday), Counter Name

Outliers: Included

In [3]:
mdata = codify_date(data)
X, y = get_X_y(mdata)
X.head()

Unnamed: 0,counter_name,hour,weekday,daymonth,IsHoliday
48321,28 boulevard Diderot E-O,2,1,01_9,False
48324,28 boulevard Diderot E-O,3,1,01_9,False
48327,28 boulevard Diderot E-O,4,1,01_9,False
48330,28 boulevard Diderot E-O,15,1,01_9,False
48333,28 boulevard Diderot E-O,18,1,01_9,False


In [33]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "Lasso": linear_model.Lasso(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.95475 +- 0.163
XG Boost - The accuracy is: -0.96059 +- 0.162
Lasso - The accuracy is: -1.67937 +- 0.135
LightGBM - The accuracy is: -0.95777 +- 0.162


### Simplest Model (different Date Codification)


In [5]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
X, y = get_X_y(mdata)
X.head()

Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
48321,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
48324,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
48327,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
48330,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
48333,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.92597 +- 0.175
XG Boost - The accuracy is: -0.91152 +- 0.176
LightGBM - The accuracy is: -0.92558 +- 0.175


### Without Outliers

In [3]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.86118 +- 0.120
XG Boost - The accuracy is: -0.83952 +- 0.122
LightGBM - The accuracy is: -0.85560 +- 0.121


### Sine Cossine Encoding

In [3]:
from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

In [24]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [30]:
categorical_columns = ["counter_name", "year", "day", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("hour_sin", sin_transformer(24), ["hour"]),
        ("hour_cos", cos_transformer(24), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.83913 +- 0.122
XG Boost - The accuracy is: -0.82384 +- 0.130
LightGBM - The accuracy is: -0.83852 +- 0.122


### Sine And Cosine Features without day

In [31]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata).drop(columns="day")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,18,False,False


In [32]:
categorical_columns = ["counter_name", "year", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("hour_sin", sin_transformer(24), ["hour"]),
        ("hour_cos", cos_transformer(24), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85674 +- 0.121
XG Boost - The accuracy is: -0.85017 +- 0.122
LightGBM - The accuracy is: -0.85558 +- 0.123


### Periodic Spline Features

In [13]:
from sklearn.preprocessing import SplineTransformer


def periodic_spline_transformer(period, n_splines=None, degree=3):
    if n_splines is None:
        n_splines = period
    n_knots = n_splines + 1  # periodic and include_bias is True
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True,
    )

In [28]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [29]:
categorical_columns = ["counter_name", "year", "day", "is_weekend", "IsHoliday", "month", "day_of_week"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ("cyclic_hour", periodic_spline_transformer(24, n_splines=12), ["hour"]),
    ],
#    remainder=MinMaxScaler(),
)

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.84487 +- 0.124
XG Boost - The accuracy is: -0.82768 +- 0.124
LightGBM - The accuracy is: -0.84149 +- 0.121


### Without Outliers, With 1/0 Covid-19 Column

In [3]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,Covid-19
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False,0
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False,0
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False,0
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False,0
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False,0


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85981 +- 0.119
XG Boost - The accuracy is: -0.83691 +- 0.124
LightGBM - The accuracy is: -0.85730 +- 0.122


### Without Outliers, With Index for Covid-19

In [42]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19_2(mdata)
mdata.isnull().sum()
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Unnamed: 0,StringencyIndex_Average,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,46.76,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,46.76,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,46.76,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,46.76,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,46.76,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [43]:

numerical_columns = ["StringencyIndex_Average"]
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
], remainder='passthrough')

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.84607 +- 0.120
XG Boost - The accuracy is: -0.83515 +- 0.123
LightGBM - The accuracy is: -0.84432 +- 0.119


### With Weather Data

Outliers: Removed

Missing Values: Linear Interpolation

In [44]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         1326
ht_neige    7232
raf10       1650
etat_sol    8022
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 8 avenue de la porte de Charenton NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,Voie Georges Pompidou NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,20 Avenue de Clichy NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Pont des Invalides S-N,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [46]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
], remainder='passthrough')

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.69918 +- 0.122
XG Boost - The accuracy is: -0.70420 +- 0.143
LightGBM - The accuracy is: -0.69606 +- 0.123


Outliers: Removed

Missing Values: Spline Interpolation

In [47]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "spline")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         1326
ht_neige    7232
raf10       1650
etat_sol    8022
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 8 avenue de la porte de Charenton NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,Voie Georges Pompidou NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,20 Avenue de Clichy NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Pont des Invalides S-N,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [48]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
], remainder='passthrough')

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.70025 +- 0.124
XG Boost - The accuracy is: -0.70420 +- 0.143
LightGBM - The accuracy is: -0.69606 +- 0.123


Outliers: Removed

Missing Values: Median

In [49]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0_level_0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
original_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False,283.95,0.0,88,0.0,1.5,1.1,2,0.0,-60
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False,283.95,0.0,88,0.0,1.5,1.1,2,0.0,-60
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False,293.65,0.0,41,0.0,7.5,4.0,3,0.0,-110
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False,292.15,0.0,47,0.0,6.5,3.0,2,0.0,10


In [50]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("num", SimpleImputer(strategy='median'), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
], remainder='passthrough')

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85079 +- 0.119
XG Boost - The accuracy is: -0.83677 +- 0.123
LightGBM - The accuracy is: -0.84512 +- 0.115


### Weather Data with Sin and Cos

In [51]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         1326
ht_neige    7232
raf10       1650
etat_sol    8022
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 8 avenue de la porte de Charenton NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,Voie Georges Pompidou NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,20 Avenue de Clichy NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Pont des Invalides S-N,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [52]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

regressors = {"XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

XG Boost - The accuracy is: -0.66857 +- 0.134
LightGBM - The accuracy is: -0.68253 +- 0.124


### Weather Data, Sin Cos, Covid Index

In [53]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19_2(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Columns with missing values and their counts:
rr1         1326
ht_neige    7232
raf10       1650
etat_sol    8022
dtype: int64


Unnamed: 0,StringencyIndex_Average,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,46.76,Face au 8 avenue de la porte de Charenton NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,46.76,Voie Georges Pompidou NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,46.76,20 Avenue de Clichy NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,46.76,Pont des Invalides S-N,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,46.76,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [54]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

regressors = {"XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

XG Boost - The accuracy is: -0.65841 +- 0.144
LightGBM - The accuracy is: -0.66109 +- 0.130


In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import numpy as np

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

# Train the models and extract feature importance
for regressor_name, regressor in regressors.items():
    
    X_transformed = preprocessor.fit_transform(X)
    
    feature_names = []
    for name, transformer, columns in preprocessor.transformers_:
        if name == "cat":  # For OneHotEncoder
            feature_names.extend(transformer.get_feature_names_out(columns))
        else:
            feature_names.extend(columns)  # Numerical and other features
    
    # Fit the regressor
    regressor.fit(X_transformed, y)
    
    # Get feature importances
    importances = regressor.feature_importances_

    # Combine feature names and importances
    importance_df = pd.DataFrame({
        "Feature": feature_names,
        "Importance": importances
    }).sort_values(by="Importance", ascending=False)
    
    print(f"\n{regressor_name} Feature Importances:")
    display(importance_df.head(40))


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




XG Boost Feature Importances:


Unnamed: 0,Feature,Importance
113,hour,0.058037
10,counter_name_28 boulevard Diderot E-O,0.05624
47,counter_name_Totem 64 Rue de Rivoli O-E,0.052891
49,counter_name_Totem 73 boulevard de Sébastopol S-N,0.052319
29,counter_name_Face au 40 quai D'Issy NE-SO,0.049249
48,counter_name_Totem 73 boulevard de Sébastopol N-S,0.047056
30,counter_name_Face au 40 quai D'Issy SO-NE,0.04683
9,counter_name_27 quai de la Tournelle SE-NO,0.043471
20,counter_name_67 boulevard Voltaire SE-NO,0.039852
28,counter_name_Face au 4 avenue de la porte de B...,0.034986


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).




LightGBM Feature Importances:


Unnamed: 0,Feature,Importance
112,hour,320
113,hour,247
114,8,183
108,is_weekend_False,105
10,counter_name_28 boulevard Diderot E-O,65
122,16,59
116,10,59
49,counter_name_Totem 73 boulevard de Sébastopol S-N,59
30,counter_name_Face au 40 quai D'Issy SO-NE,57
61,month_4,56


### With Lags

Check all features, one by one

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = add_lag_and_rolling_features(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  data.groupby('counter_id')


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,...,ww,etat_sol,tend,lag_1,lag_24,lag_168,rolling_mean_24h,rolling_std_24h,rolling_mean_7d,rolling_std_7d
169,28 boulevard Diderot O-E,2020,9,8,1,2,False,False,288.25,0.0,...,1,0.0,50,1.0,2.0,2.0,74.416667,70.794201,65.089286,58.821747
170,28 boulevard Diderot O-E,2020,9,8,1,3,False,False,288.25,0.0,...,1,0.0,50,4.0,2.0,7.0,74.625,70.578857,65.089286,58.821747
171,28 boulevard Diderot O-E,2020,9,8,1,4,False,False,288.25,0.0,...,1,0.0,50,7.0,4.0,10.0,74.875,70.327997,65.089286,58.821747
172,28 boulevard Diderot O-E,2020,9,8,1,5,False,False,288.25,0.0,...,1,0.0,50,10.0,48.0,48.0,75.416667,70.161873,65.166667,58.807681
173,28 boulevard Diderot O-E,2020,9,8,1,6,False,False,288.25,0.0,...,1,0.0,50,61.0,133.0,95.0,76.125,70.850923,65.494048,59.126982


In [9]:
weather_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", 
                       "hour", "is_weekend", "IsHoliday"]
lag_columns = ["lag_1", "lag_24", "lag_168", "rolling_mean_24h", "rolling_std_24h", 
               "rolling_mean_7d", "rolling_std_7d"]

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy='median'), weather_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
}

for i in range(0, len(lag_columns)):
    current_lag_columns = [lag_columns[i]]  # Add one lag column at a time
    print(f"\nEvaluating with lag columns: {current_lag_columns}")
    
    # Update the preprocessor to include the current lag columns
    preprocessor = ColumnTransformer([
        ("num", SimpleImputer(strategy='median'), weather_columns + current_lag_columns),
        ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ])
    
    for regressor_name, regressor in regressors.items():
        # Define the pipeline
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", regressor)
        ])
        
        # Perform cross-validation
        scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                                 n_jobs=-1, scoring="neg_root_mean_squared_error")
        
        # Print results
        print(f"{regressor_name} - RMSE: {abs(scores.mean()):.5f} ± {scores.std():.3f}")


Evaluating with lag columns: ['lag_1']
XG Boost - RMSE: 0.46668 ± 0.040
LightGBM - RMSE: 0.46949 ± 0.035

Evaluating with lag columns: ['lag_24']
XG Boost - RMSE: 0.50796 ± 0.041
LightGBM - RMSE: 0.50793 ± 0.035

Evaluating with lag columns: ['lag_168']
XG Boost - RMSE: 0.50734 ± 0.056
LightGBM - RMSE: 0.50976 ± 0.050

Evaluating with lag columns: ['rolling_mean_24h']
XG Boost - RMSE: 0.52590 ± 0.037
LightGBM - RMSE: 0.55538 ± 0.031

Evaluating with lag columns: ['rolling_std_24h']
XG Boost - RMSE: 0.59439 ± 0.047
LightGBM - RMSE: 0.60560 ± 0.039

Evaluating with lag columns: ['rolling_mean_7d']
XG Boost - RMSE: 0.53918 ± 0.042
LightGBM - RMSE: 0.57278 ± 0.034

Evaluating with lag columns: ['rolling_std_7d']
XG Boost - RMSE: 0.57369 ± 0.055
LightGBM - RMSE: 0.60359 ± 0.047


# Submission

In [4]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         1326
ht_neige    7232
raf10       1650
etat_sol    8022
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,Covid-19,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 8 avenue de la porte de Charenton NO-SE,2020,9,1,1,1,False,False,0,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,Voie Georges Pompidou NE-SO,2020,9,1,1,1,False,False,0,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,20 Avenue de Clichy NO-SE,2020,9,1,1,1,False,False,0,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Pont des Invalides S-N,2020,9,1,1,1,False,False,0,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,0,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [5]:
X_test = pd.read_parquet(Path("data") / "final_test.parquet")
X_test = codify_date_2(X_test)
X_test = covid_19(X_test)
X_test = add_weather(X_test)
X_test = handle_missing_values(X_test, "linear")

X_test.head()

Columns with missing values and their counts:
rr1         165
ht_neige    660
etat_sol    498
dtype: int64


Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,...,Covid-19,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,...,0,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
1,100036718-103036718,39 quai François Mauriac SE-NO,100036718,39 quai François Mauriac,2021-09-10,2017-07-12,"48.83436,2.377",Y2H17021629,48.83436,2.377,...,0,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
2,100057380-104057380,Totem Cours la Reine E-O,100057380,Totem Cours la Reine,2021-09-10,2020-02-11,"48.86462,2.31444",YTH19111509,48.86462,2.31444,...,0,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
3,100056223-SC,Pont des Invalides N-S,100056223,Pont des Invalides N-S,2021-09-10,2019-11-07,"48.86284,2.310345",Y2H19070365,48.86284,2.310345,...,0,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0
4,100056226-104056226,Face au 8 avenue de la porte de Charenton NO-SE,100056226,Face au 8 avenue de la porte de Charenton,2021-09-10,2019-11-01,"48.830331,2.400551",Y2H19070370,48.830331,2.400551,...,0,291.45,0.0,94,0.0,3.4,2.0,0,1.0,0


In [6]:
# Save original index
X_original = pd.read_parquet(Path("data") / "final_test.parquet")
X_original.loc[:, "index1"] = X_original.index
X_original = codify_date_2(X_original)

# Merge DataFrames
X_test.loc[:, "index2"] = X_test.index
merged_df = X_test.merge(X_original, on=["datetime", "counter_name"], how="left")

merged_df = merged_df.sort_values("index1")
display(merged_df[["index1", "index2"]])


Unnamed: 0,index1,index2
0,0,0
719,1,719
925,2,925
1012,3,1012
1184,4,1184
...,...,...
50829,51435,50829
51103,51436,51103
51166,51437,51166
51270,51438,51270


In [7]:
X_test = X_test.drop(columns=["counter_id", "site_id", "site_name", 
                                "counter_installation_date", 
                              "coordinates", "counter_technical_id",
                              "latitude", "longitude", "datetime", "date"])

In [8]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
    ("hour_sin", sin_transformer(24), ["hour"]),
    ("hour_cos", cos_transformer(24), ["hour"]),
], remainder='passthrough')

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor())
])

# Fit the pipeline to the training data
pipeline.fit(X, y)

y_pred = pipeline.predict(X_test)
y_pred = y_pred[merged_df["index2"]]

results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)
results.to_csv("submission.csv", index=False)



AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
#dasda