# Model Testing

In [11]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from feature_engineering import codify_date, codify_date_2, remove_outliers, get_X_y, covid_19, covid_19_2
from feature_engineering import add_weather, add_lag_and_rolling_features
from utils import handle_missing_values
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
data = pd.read_parquet(Path("data") / "train.parquet")
data_test = pd.read_parquet(Path("data") / "final_test.parquet")

### Simplest Model
Variables: Date (hour, weekdaay, daymonth, IsHoliday), Counter Name

Outliers: Included

In [None]:
mdata = codify_date(data)
X, y = get_X_y(mdata)
X.head()

Unnamed: 0,counter_name,hour,weekday,daymonth,IsHoliday
48321,28 boulevard Diderot E-O,2,1,01_9,False
48324,28 boulevard Diderot E-O,3,1,01_9,False
48327,28 boulevard Diderot E-O,4,1,01_9,False
48330,28 boulevard Diderot E-O,15,1,01_9,False
48333,28 boulevard Diderot E-O,18,1,01_9,False


In [33]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "Lasso": linear_model.Lasso(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.95475 +- 0.163
XG Boost - The accuracy is: -0.96059 +- 0.162
Lasso - The accuracy is: -1.67937 +- 0.135
LightGBM - The accuracy is: -0.95777 +- 0.162


### Simplest Model (different Date Codification)


In [3]:
mdata = codify_date_2(data)
X, y = get_X_y(mdata)
X.head()

Unnamed: 0,counter_name,year,month,day,day_of_week,hour,minute,is_weekend,IsHoliday
48321,28 boulevard Diderot E-O,2020,9,1,1,2,0,False,False
48324,28 boulevard Diderot E-O,2020,9,1,1,3,0,False,False
48327,28 boulevard Diderot E-O,2020,9,1,1,4,0,False,False
48330,28 boulevard Diderot E-O,2020,9,1,1,15,0,False,False
48333,28 boulevard Diderot E-O,2020,9,1,1,18,0,False,False


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.92597 +- 0.175
XG Boost - The accuracy is: -0.91152 +- 0.176
LightGBM - The accuracy is: -0.92558 +- 0.175


### Without Outliers

In [3]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85885 +- 0.119
XG Boost - The accuracy is: -0.83952 +- 0.122
LightGBM - The accuracy is: -0.85560 +- 0.121


### Without Outliers, With 1/0 Covid-19 Column

In [3]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,Covid-19
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False,0
1,28 boulevard Diderot E-O,2020,9,1,1,3,False,False,0
2,28 boulevard Diderot E-O,2020,9,1,1,4,False,False,0
3,28 boulevard Diderot E-O,2020,9,1,1,15,False,False,0
4,28 boulevard Diderot E-O,2020,9,1,1,18,False,False,0


In [4]:
regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }

for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.85981 +- 0.119
XG Boost - The accuracy is: -0.83691 +- 0.124
LightGBM - The accuracy is: -0.85730 +- 0.122


### Without Outliers, With Index for Covid-19

In [4]:
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = covid_19_2(mdata)
mdata.isnull().sum()
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])
  covid_19_index = pd.read_csv(Path("data") / "Covid_19_Index.csv")


Unnamed: 0,StringencyIndex_Average,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday
0,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
1,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
2,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
3,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False
4,46.76,28 boulevard Diderot E-O,2020,9,1,1,0,False,False


In [None]:

numerical_columns = ["StringencyIndex_Average"]
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -1.56809 +- 0.090
XG Boost - The accuracy is: -1.57179 +- 0.088
LightGBM - The accuracy is: -1.57015 +- 0.092


### With Weather Data

Outliers: Removed

Missing Values: Linear Interpolation

In [3]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         3984
ht_neige    5232
raf10       2640
etat_sol    5232
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 48 quai de la marne NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,6 rue Julia Bartet SO-NE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,18 quai de l'Hôtel de Ville SE-NO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Face au 4 avenue de la porte de Bagnolet O-E,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [4]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.76020 +- 0.100
XG Boost - The accuracy is: -0.74025 +- 0.106
LightGBM - The accuracy is: -0.75816 +- 0.096


Outliers: Included

Missing Values: Linear Interpolation

In [6]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "linear")
X, y = get_X_y(mdata)
X.head()

Columns with missing values and their counts:
rr1         4032
ht_neige    5376
raf10       2688
etat_sol    5376
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,27 quai de la Tournelle SE-NO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,90 Rue De Sèvres SO-NE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,152 boulevard du Montparnasse O-E,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Face au 25 quai de l'Oise NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,Pont de la Concorde S-N,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [7]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.82307 +- 0.095
XG Boost - The accuracy is: -0.80880 +- 0.093
LightGBM - The accuracy is: -0.81815 +- 0.095


Outliers: Removed

Missing Values: Spline Interpolation

In [8]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
mdata = handle_missing_values(mdata, "spline")
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Columns with missing values and their counts:
rr1         3984
ht_neige    5232
raf10       2640
etat_sol    5232
dtype: int64


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,Face au 48 quai de la marne NE-SO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,6 rue Julia Bartet SO-NE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,18 quai de l'Hôtel de Ville SE-NO,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Face au 4 avenue de la porte de Bagnolet O-E,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,39 quai François Mauriac NO-SE,2020,9,1,1,1,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [None]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.75595 +- 0.107
XG Boost - The accuracy is: -0.74025 +- 0.106
LightGBM - The accuracy is: -0.75816 +- 0.096


Outliers: Removed

Missing Values: Median

In [10]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
X, y = get_X_y(mdata)
X.head()

  data.groupby(["counter_name", "date_truncated"])


Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
0,28 boulevard Diderot E-O,2020,9,1,1,2,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
1,Pont de la Concorde S-N,2020,9,1,1,14,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
2,Pont de la Concorde S-N,2020,9,1,1,13,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
3,Pont de la Concorde S-N,2020,9,1,1,12,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10
4,Pont de la Concorde S-N,2020,9,1,1,9,False,False,285.75,0.0,81,0.0,2.4,1.6,1,0.0,-10


In [14]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy='median'), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

regressors = {"Histogram-based Gradient Boosting": HistGradientBoostingRegressor(),
             "XG Boost": XGBRegressor(),
             "LightGBM": LGBMRegressor(verbosity=-1),
            }
for regressor_name, regressor in regressors.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", regressor)
    ])

    scores = cross_val_score(pipeline, X, y, cv=utils.get_cv(X, y), 
                             n_jobs=-1, scoring="neg_root_mean_squared_error")
    print(f"{regressor_name} - The accuracy is: {scores.mean():.5f} +- {scores.std():.3f}")

Histogram-based Gradient Boosting - The accuracy is: -0.70994 +- 0.113
XG Boost - The accuracy is: -0.68451 +- 0.138
LightGBM - The accuracy is: -0.70910 +- 0.117


In [None]:
data = pd.read_parquet(Path("data") / "train.parquet")
mdata = codify_date_2(data)
mdata = remove_outliers(mdata)
mdata = add_weather(mdata)
X, y = get_X_y(mdata)
X.head()

In [22]:
X_test = pd.read_parquet(Path("data") / "final_test.parquet")
X_test = codify_date_2(X_test)
X_test = add_weather(X_test)
X_test = X_test.drop(columns=["counter_id", "site_id", "site_name", 
                                "counter_installation_date", 
                              "coordinates", "counter_technical_id",
                              "latitude", "longitude", "datetime", "date"])
X_test.tail()

Unnamed: 0,counter_name,year,month,day,day_of_week,hour,is_weekend,IsHoliday,t,rr1,u,ht_neige,raf10,ff,ww,etat_sol,tend
51435,6 rue Julia Bartet NE-SO,2021,10,18,0,19,False,False,279.25,0.0,92,0.0,0.6,0.0,2,0.0,40
51436,6 rue Julia Bartet NE-SO,2021,10,18,0,16,False,False,279.25,0.0,92,0.0,0.6,0.0,2,0.0,40
51437,6 rue Julia Bartet NE-SO,2021,10,18,0,12,False,False,279.25,0.0,92,0.0,0.6,0.0,2,0.0,40
51438,6 rue Julia Bartet NE-SO,2021,10,18,0,5,False,False,279.25,0.0,92,0.0,0.6,0.0,2,0.0,40
51439,254 rue de Vaugirard SO-NE,2021,10,18,0,19,False,False,279.25,0.0,92,0.0,0.6,0.0,2,0.0,40


In [21]:
numerical_columns = ['t', 'rr1', 'u', 'ht_neige', 'raf10', 'ff', 'ww', 'etat_sol', 'tend']
categorical_columns = ["counter_name", "year", "month", "day", "day_of_week", "hour", "is_weekend", "IsHoliday"]

preprocessor = ColumnTransformer([
    ("num", SimpleImputer(strategy='median'), numerical_columns),
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns)
])

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", XGBRegressor())
])

# Fit the pipeline to the training data
pipeline.fit(X, y)

# Make predictions on the test data
predictions = pipeline.predict(X_test)

# Save the predictions to a CSV file
predictions_df = pd.DataFrame(predictions, columns=["predictions"])
predictions_df.head()

# Save to CSV
predictions_df.to_csv("predictions_output.csv", index=True)