In [44]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from vacances_scolaires_france import SchoolHolidayDates
from jours_feries_france import JoursFeries
from datetime import datetime, date
import seaborn as sns
import matplotlib.pyplot as plt
from astral import LocationInfo
from astral.sun import daylight
from sklearn.kernel_ridge import KernelRidge

In [45]:
data = pd.read_parquet('../data/train.parquet')

In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455163 entries, 48321 to 928462
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 455163 non-null  category      
 1   counter_name               455163 non-null  category      
 2   site_id                    455163 non-null  int64         
 3   site_name                  455163 non-null  category      
 4   bike_count                 455163 non-null  float64       
 5   date                       455163 non-null  datetime64[ns]
 6   counter_installation_date  455163 non-null  datetime64[ns]
 7   counter_technical_id       455163 non-null  category      
 8   latitude                   455163 non-null  float64       
 9   longitude                  455163 non-null  float64       
 10  log_bike_count             455163 non-null  float64       
dtypes: category(4), datetime64[ns](2), float64(4), i

In [47]:
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,Y2H15027244,48.846028,2.375429,2.302585


In [48]:
d = SchoolHolidayDates()
jf = JoursFeries()
data['hour'] = data.date.dt.hour
data['weekday'] = data.date.dt.weekday
data['dom'] = data.date.dt.day
data['week'] = data.date.dt.isocalendar().week
data['month'] = data.date.dt.month
data['year'] = data.date.dt.year
data['dom_counter'] = data.counter_installation_date.dt.day
data['month_counter'] = data.counter_installation_date.dt.month
data['year_counter'] = data.counter_installation_date.dt.year
data['date_datetime'] = data.date.map(lambda x: x.to_pydatetime().date())
# data['is_ferie'] = data.date_datetime.map(lambda x: jf.is_bank_holiday(x, zone='Métropole'))
# data['is_holiday'] = data.date_datetime.map(lambda x: d.is_holiday_for_zone(x, 'C'))

In [49]:
def is_daylight(x):
    city=LocationInfo('Paris', timezone='Europe/Paris')
    sun_info = daylight(city.observer, date=x.to_pydatetime().date(), tzinfo='Europe/Paris')
    x = x.tz_localize('Europe/Paris', ambiguous=True, nonexistent='shift_forward')
    return (x > sun_info[0]) & (x < sun_info[1])

In [50]:
data['is_daylight'] = data.date.map(is_daylight)

In [51]:
data.drop(columns=['counter_name', 'site_name', 'counter_technical_id', 'counter_installation_date'], inplace=True)

In [52]:
external_data = pd.read_csv("external_data_reworked.csv", parse_dates=['date'])
external_data_grouped = external_data.groupby(by=['date_datetime']).sum()
jours_feries = external_data_grouped.is_ferie > 0
holidays = external_data_grouped.is_holiday > 0
counters_list = data.counter_id.unique()
down_counters = external_data_grouped[counters_list] > 0
days_down={}
for counter in counters_list :
    string_list = down_counters[down_counters[counter] == True].index.values.tolist()
    date_datetime_list = [pd.to_datetime(j).date() for j in string_list]
    days_down[counter] = date_datetime_list

data['is_down'] = data.apply(lambda x: x.date_datetime in days_down[x.counter_id], axis=1)
data = data[data['is_down'] == False]
data['is_ferie'] = data.date_datetime.map(lambda x: jours_feries[str(x)])
data['is_holiday'] = data.date_datetime.map(lambda x: holidays[str(x)])
data.drop(columns=['is_down', 'date_datetime'], inplace=True)

In [53]:
data

Unnamed: 0,counter_id,site_id,bike_count,date,latitude,longitude,log_bike_count,hour,weekday,dom,week,month,year,dom_counter,month_counter,year_counter,is_daylight,is_ferie,is_holiday
48321,100007049-102007049,100007049,0.0,2020-09-01 02:00:00,48.846028,2.375429,0.000000,2,1,1,36,9,2020,18,1,2013,False,False,False
48324,100007049-102007049,100007049,1.0,2020-09-01 03:00:00,48.846028,2.375429,0.693147,3,1,1,36,9,2020,18,1,2013,False,False,False
48327,100007049-102007049,100007049,0.0,2020-09-01 04:00:00,48.846028,2.375429,0.000000,4,1,1,36,9,2020,18,1,2013,False,False,False
48330,100007049-102007049,100007049,4.0,2020-09-01 15:00:00,48.846028,2.375429,1.609438,15,1,1,36,9,2020,18,1,2013,True,False,False
48333,100007049-102007049,100007049,9.0,2020-09-01 18:00:00,48.846028,2.375429,2.302585,18,1,1,36,9,2020,18,1,2013,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928450,300014702-353245971,300014702,51.0,2021-08-08 18:00:00,48.839770,2.301980,3.951244,18,6,8,31,8,2021,29,11,2020,True,False,True
928453,300014702-353245971,300014702,1.0,2021-08-09 02:00:00,48.839770,2.301980,0.693147,2,0,9,32,8,2021,29,11,2020,False,False,True
928456,300014702-353245971,300014702,61.0,2021-08-09 08:00:00,48.839770,2.301980,4.127134,8,0,9,32,8,2021,29,11,2020,True,False,True
928459,300014702-353245971,300014702,44.0,2021-08-09 10:00:00,48.839770,2.301980,3.806662,10,0,9,32,8,2021,29,11,2020,True,False,True


In [54]:
external_data.cl = external_data.cl.fillna(value=100)
external_data.cm = external_data.cm.fillna(value=100)
external_data.ch = external_data.ch.fillna(value=100)
external_data.ssfrai = external_data.ssfrai.fillna(value=0.0)
external_data.perssfrai = external_data.perssfrai.fillna(value=0.0)
external_data.dropna(axis=1, thresh=3000, inplace=True)
external_data.drop(columns=counters_list, inplace=True)
external_data.drop(columns=['numer_sta', 'per', 'pres', 'is_ferie', 'is_holiday'], inplace=True)
external_data.fillna(method='ffill', inplace=True)

In [55]:
external_data

Unnamed: 0,date,pmer,tend,cod_tend,dd,ff,t,td,u,vv,...,etat_sol,ht_neige,ssfrai,perssfrai,rr1,rr3,rr6,rr12,rr24,date_datetime
0,2021-01-01 00:00:00,100810,80,1,270,1.8,272.75,272.15,96,990,...,1.0,0.00,0.00,-60.0,0.0,0.0,0.0,0.0,2.0,2021-01-01
1,2021-01-01 03:00:00,100920,110,3,300,1.7,271.25,270.95,98,210,...,1.0,0.00,0.00,-30.0,0.0,0.0,0.0,0.0,1.2,2021-01-01
2,2021-01-01 06:00:00,100950,30,3,290,2.6,271.95,271.65,98,3660,...,1.0,0.00,0.00,-60.0,0.0,0.0,0.0,0.0,1.0,2021-01-01
3,2021-01-01 09:00:00,101100,150,2,280,1.7,272.45,272.05,97,3500,...,13.0,0.01,0.01,-30.0,0.0,0.2,0.2,0.2,0.2,2021-01-01
4,2021-01-01 12:00:00,101110,30,0,50,1.0,276.95,274.15,82,8000,...,11.0,-0.01,0.00,-60.0,0.0,0.0,0.2,0.2,0.2,2021-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,2020-09-30 09:00:00,101540,-30,8,230,4.4,289.95,286.85,82,18000,...,0.0,0.00,0.00,-30.0,0.0,0.0,0.0,0.0,2.2,2020-09-30
3318,2020-09-30 12:00:00,101320,-210,8,190,4.9,292.05,285.55,66,25000,...,0.0,0.00,0.00,-60.0,0.0,0.2,0.2,0.2,1.6,2020-09-30
3319,2020-09-30 15:00:00,101140,-180,7,190,4.1,291.55,286.45,72,25000,...,0.0,0.00,0.00,-30.0,0.0,0.0,0.2,0.2,0.2,2020-09-30
3320,2020-09-30 18:00:00,101020,-130,6,190,2.7,290.15,285.25,73,40820,...,0.0,0.00,0.00,-60.0,0.0,0.0,0.0,0.2,0.2,2020-09-30


In [56]:
ext_index = external_data.set_index('date')

In [57]:
ext_index.sort_index(inplace=True)

In [83]:
ext_index

Unnamed: 0_level_0,pmer,tend,cod_tend,dd,ff,t,td,u,vv,ww,...,etat_sol,ht_neige,ssfrai,perssfrai,rr1,rr3,rr6,rr12,rr24,date_datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-09-01 00:00:00,102050,-10,8,340,1.6,285.75,282.55,81,30000,1,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 03:00:00,101990,-60,6,290,1.1,283.95,282.05,88,25000,2,...,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 06:00:00,102000,10,3,360,1.8,284.25,282.85,91,25000,3,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 09:00:00,101970,0,5,360,2.9,291.25,283.35,60,19830,1,...,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 12:00:00,101850,-110,6,40,2.6,293.95,281.25,44,21000,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-10-21 00:00:00,99800,50,0,230,11.8,287.05,281.45,69,19910,1,...,0.0,0.0,0.0,-60.0,0.0,2.0,2.0,2.0,2.6,2021-10-21
2021-10-21 03:00:00,100270,460,3,260,14.6,286.05,279.35,64,30000,60,...,0.0,0.0,0.0,-30.0,-0.1,-0.1,2.0,2.0,2.6,2021-10-21
2021-10-21 06:00:00,101000,720,1,240,7.7,284.05,279.95,76,30000,1,...,0.0,0.0,0.0,-60.0,0.0,-0.1,-0.1,2.0,2.6,2021-10-21
2021-10-21 09:00:00,101230,230,1,240,7.0,286.05,280.75,70,30000,1,...,0.0,0.0,0.0,-30.0,0.0,0.0,-0.1,2.0,2.0,2021-10-21


In [84]:
data_index

Unnamed: 0_level_0,counter_id,site_id,bike_count,latitude,longitude,log_bike_count,hour,weekday,dom,week,month,year,dom_counter,month_counter,year_counter,is_daylight,is_ferie,is_holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-09-01 01:00:00,100057445-103057445,100057445,2.0,48.86377,2.35096,1.098612,1,1,1,36,9,2020,11,2,2020,False,False,False
2020-09-01 01:00:00,100057329-103057329,100057329,4.0,48.84201,2.36729,1.609438,1,1,1,36,9,2020,18,2,2020,False,False,False
2020-09-01 01:00:00,100060178-102060178,100060178,21.0,48.84638,2.31529,3.091042,1,1,1,36,9,2020,22,7,2020,False,False,False
2020-09-01 01:00:00,100056332-103056332,100056332,2.0,48.83848,2.37587,1.098612,1,1,1,36,9,2020,11,12,2019,False,False,False
2020-09-01 01:00:00,100056335-104056335,100056335,9.0,48.86288,2.31179,2.302585,1,1,1,36,9,2020,7,11,2019,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-09 23:00:00,100056335-103056335,100056335,12.0,48.86288,2.31179,2.564949,23,0,9,32,8,2021,7,11,2019,False,False,True
2021-08-09 23:00:00,100042374-110042374,100042374,4.0,48.84840,2.27586,1.609438,23,0,9,32,8,2021,15,12,2017,False,False,True
2021-08-09 23:00:00,100056047-SC,100056047,10.0,48.86378,2.32003,2.397895,23,0,9,32,8,2021,8,11,2019,False,False,True
2021-08-09 23:00:00,100036719-103036719,100036719,3.0,48.85372,2.35702,1.386294,23,0,9,32,8,2021,12,7,2017,False,False,True


In [58]:
data_index = data.set_index('date')
data_index.sort_index(inplace=True)

In [59]:
merged_data = pd.merge_asof(data_index, ext_index, left_index=True, right_index=True)

In [60]:
merged_data

Unnamed: 0_level_0,counter_id,site_id,bike_count,latitude,longitude,log_bike_count,hour,weekday,dom,week,...,etat_sol,ht_neige,ssfrai,perssfrai,rr1,rr3,rr6,rr12,rr24,date_datetime
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-09-01 01:00:00,100057445-103057445,100057445,2.0,48.86377,2.35096,1.098612,1,1,1,36,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 01:00:00,100057329-103057329,100057329,4.0,48.84201,2.36729,1.609438,1,1,1,36,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 01:00:00,100060178-102060178,100060178,21.0,48.84638,2.31529,3.091042,1,1,1,36,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 01:00:00,100056332-103056332,100056332,2.0,48.83848,2.37587,1.098612,1,1,1,36,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
2020-09-01 01:00:00,100056335-104056335,100056335,9.0,48.86288,2.31179,2.302585,1,1,1,36,...,0.0,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,2020-09-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-09 23:00:00,100056335-103056335,100056335,12.0,48.86288,2.31179,2.564949,23,0,9,32,...,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,2021-08-09
2021-08-09 23:00:00,100042374-110042374,100042374,4.0,48.84840,2.27586,1.609438,23,0,9,32,...,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,2021-08-09
2021-08-09 23:00:00,100056047-SC,100056047,10.0,48.86378,2.32003,2.397895,23,0,9,32,...,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,2021-08-09
2021-08-09 23:00:00,100036719-103036719,100036719,3.0,48.85372,2.35702,1.386294,23,0,9,32,...,0.0,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,2021-08-09


In [61]:
merged_data['is_confinement_1'] = (merged_data.date_datetime > '2020-10-30') & (merged_data.date_datetime < '2020-12-15')
merged_data['is_confinement_2'] = (merged_data.date_datetime > '2021-04-03') & (merged_data.date_datetime < '2021-05-03')

In [86]:
# checking if the temperature values make sens after the merge
merged_grouped =merged_data.groupby(by='date_datetime').t.max()
sns.lineplot(data=merged_grouped)

In [63]:
feature = merged_data.drop(columns=['bike_count', 'log_bike_count', 'date_datetime'])
target = merged_data['log_bike_count']

In [66]:
categorical_columns = ['counter_id', 'site_id', 'is_ferie', 'is_holiday', 'is_confinement_1', 'is_confinement_2', 'hour', 'weekday', 'month', 'year', 'is_daylight', 'cm', 'cl', 'ch']
numerical_columns = [col for col in feature.columns if col not in categorical_columns]

In [67]:
categories = []
for col in categorical_columns :
    cats = feature[col].unique()
    if np.issubdtype(type(cats[0]), np.number):
        cats.sort()
    categories.append(cats)

categories

[['100057445-103057445', '100057329-103057329', '100060178-102060178', '100056332-103056332', '100056335-104056335', ..., '100056336-106056336', '100056335-103056335', '100063175-353277235', '300014702-353245971', '300014702-353245972']
 Length: 56
 Categories (56, object): ['100007049-101007049', '100007049-102007049', '100036718-103036718', '100036718-104036718', ..., '100063175-353277233', '100063175-353277235', '300014702-353245971', '300014702-353245972'],
 array([100007049, 100036718, 100036719, 100042374, 100044493, 100047542,
        100047545, 100047546, 100047547, 100047548, 100049407, 100050876,
        100056046, 100056047, 100056223, 100056226, 100056327, 100056329,
        100056330, 100056331, 100056332, 100056334, 100056335, 100056336,
        100057329, 100057380, 100057445, 100060178, 100063175, 300014702],
       dtype=int64),
 array([False,  True]),
 array([False,  True]),
 array([False,  True]),
 array([False,  True]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9

In [68]:
preprocessor = ColumnTransformer([('one_hot_encoder', OneHotEncoder(categories=categories, drop='first', sparse=False, handle_unknown='ignore'), categorical_columns),
                                ('standard_scaler', StandardScaler(), numerical_columns)])

In [69]:
inputs = keras.Input(shape=(193, ))
dense = layers.Dense(256, activation='relu')
x = dense(inputs)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1)(x)

In [70]:
model = keras.Model(inputs=inputs, outputs=outputs, name='bikes_deep')
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

Model: "bikes_deep"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 193)]             0         
                                                                 
 dense (Dense)               (None, 256)               49664     
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 90,881
Trainable params: 90,881
Non-trainable params: 0
_________________________________________________________________


In [71]:
features_train, features_test, target_train, target_test = train_test_split(feature, target, shuffle=False, train_size=0.95)

In [72]:
pipe = make_pipeline(preprocessor, model)

In [73]:
pipe.fit(features_train, target_train, functional__validation_split=0.05, functional__epochs=20, functional__batch_size=1000, functional__shuffle=False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one_hot_encoder',
                                                  OneHotEncoder(categories=[['100057445-103057445', '100057329-103057329', '100060178-102060178', '100056332-103056332', '100056335-104056335', ..., '100056336-106056336', '100056335-103056335', '100063175-353277235', '300014702-353245971', '300014702-353245972']
Length: 56
Ca...
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['latitude', 'longitude',
                                                   'dom', 'week', 'dom_counter',
                                                   'month_counter',
                                                   'year_counter', 'pmer',
                                                   'tend', 'cod_tend', 'dd',
                                                  

In [74]:
preds = pipe.predict(features_test)
score = mean_squared_error(target_test, preds, squared=False)
print(score)

0.7013268183344014


In [75]:
from keras.wrappers.scikit_learn import KerasRegressor

def build_model():
    inputs = keras.Input(shape=(193, ))
    dense = layers.Dense(256, activation='relu')
    x = dense(inputs)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(1)(x)
    model = keras.Model(inputs=inputs, outputs=outputs, name='bikes_deep')
    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

In [76]:
model = KerasRegressor(build_model, validation_split=0.05, epochs=20, batch_size=1000, shuffle=False)

  model = KerasRegressor(build_model, validation_split=0.05, epochs=20, batch_size=1000, shuffle=False)


In [77]:
pipe_test = make_pipeline(preprocessor, model)
pipe_test.fit(features_train, target_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one_hot_encoder',
                                                  OneHotEncoder(categories=[['100057445-103057445', '100057329-103057329', '100060178-102060178', '100056332-103056332', '100056335-104056335', ..., '100056336-106056336', '100056335-103056335', '100063175-353277235', '300014702-353245971', '300014702-353245972']
Length: 56
Ca...
                                                  StandardScaler(),
                                                  ['latitude', 'longitude',
                                                   'dom', 'week', 'dom_counter',
                                                   'month_counter',
                                                   'year_counter', 'pmer',
                                                   'tend', 'cod_tend', 'dd',
                                                   'ff', 't', 'td', 'u', 'vv',
                                        

In [78]:
preds = pipe_test.predict(features_test)
mean_squared_error(preds, target_test, squared=False)

0.6554417770357207

In [79]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit

pipe_ridge = make_pipeline(preprocessor, Ridge())

pipe_ridge.fit(features_train, target_train)

preds = pipe_ridge.predict(features_test)

print(mean_squared_error(preds, target_test, squared=False))

0.7836710329876625
