In [1]:
from pathlib import Path
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from vacances_scolaires_france import SchoolHolidayDates
from jours_feries_france import JoursFeries
from datetime import datetime, date
import seaborn as sns
import matplotlib.pyplot as plt
from astral import LocationInfo
from astral.sun import daylight

In [2]:
from problem import get_train_data, get_test_data

In [4]:
data = get_train_data()

In [9]:
data[0]

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,counter_technical_id,latitude,longitude
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,Y2H19070373,48.840801,2.333233
87516,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.853720,2.357020
98518,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,Y2H19027732,48.853720,2.357020
875137,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,Y2H20073268,48.885290,2.326660
...,...,...,...,...,...,...,...,...,...
794577,100057329-103057329,Totem 85 quai d'Austerlitz SE-NO,100057329,Totem 85 quai d'Austerlitz,2021-08-09 23:00:00,2020-02-18,YTH19111508,48.842010,2.367290
804787,100057380-104057380,Totem Cours la Reine E-O,100057380,Totem Cours la Reine,2021-08-09 23:00:00,2020-02-11,YTH19111509,48.864620,2.314440
814377,100057380-103057380,Totem Cours la Reine O-E,100057380,Totem Cours la Reine,2021-08-09 23:00:00,2020-02-11,YTH19111509,48.864620,2.314440
125899,100042374-110042374,Voie Georges Pompidou NE-SO,100042374,Voie Georges Pompidou,2021-08-09 23:00:00,2017-12-15,Y2H21025335,48.848400,2.275860


In [5]:
def _is_daylight(x):

    city=LocationInfo('Paris', timezone='Europe/Paris')
    sun_info = daylight(city.observer, date=x.to_pydatetime().date(), tzinfo='Europe/Paris')
    x = x.tz_localize('Europe/Paris', ambiguous=True, nonexistent='shift_forward')
    return (x > sun_info[0]) & (x < sun_info[1])

In [6]:
def _feature_engineering(data):
    data = data.copy()
    d = SchoolHolidayDates()
    jf = JoursFeries()
    data['hour'] = data.date.dt.hour
    data['weekday'] = data.date.dt.weekday
    data['dom'] = data.date.dt.day
    data['week'] = data.date.dt.isocalendar().week
    data['month'] = data.date.dt.month
    data['year'] = data.date.dt.year
    data['dom_counter'] = data.counter_installation_date.dt.day
    data['month_counter'] = data.counter_installation_date.dt.month
    data['year_counter'] = data.counter_installation_date.dt.year
    data['date_datetime'] = data.date.map(lambda x: x.to_pydatetime().date())
    data['is_ferie'] = data.date_datetime.map(lambda x: jf.is_bank_holiday(x, zone='Métropole'))
    data['is_holiday'] = data.date_datetime.map(lambda x: d.is_holiday_for_zone(x, 'C'))
    data.drop(columns=['counter_name', 'site_name', 'counter_technical_id', 'counter_installation_date', 'date_datetime'], inplace=True)
    data['is_daylight'] = data.date.map(_is_daylight)
    external_data = pd.read_csv("./submissions/external_data/external_data.csv")
    external_data.cl = external_data.cl.fillna(value=100)
    external_data.cm = external_data.cm.fillna(value=100)
    external_data.ch = external_data.ch.fillna(value=100)
    external_data.ssfrai = external_data.ssfrai.fillna(value=0.0)
    external_data.perssfrai = external_data.perssfrai.fillna(value=0.0)
    external_data.dropna(axis=1, thresh=3000, inplace=True)
    external_data.fillna(method='ffill', inplace=True)
    external_data['datetime_date'] = pd.to_datetime(external_data.date)
    external_data.drop(columns=['numer_sta', 'per', 'pres'], inplace=True)
    ext_index = external_data.set_index('datetime_date')
    ext_index.sort_index(inplace=True)
    data_index = data.set_index('date')
    data_index.sort_index(inplace=True)
    merged_data = pd.merge_asof(data_index, ext_index, left_index=True, right_index=True)
    merged_data['is_confinement_1'] = (merged_data.date > '2020-10-30') & (merged_data.date < '2020-12-15')
    merged_data['is_confinement_2'] = (merged_data.date > '2021-04-03') & (merged_data.date < '2021-05-03')

    return merged_data

In [10]:
merged_data = _feature_engineering(data[0])

FileNotFoundError: [Errno 2] No such file or directory: './submissions/external_data/external_data.csv'

In [6]:
merged_data.shape

(455163, 50)

In [8]:
categorical_columns = ['counter_id', 'site_id', 'is_ferie', 'is_holiday', 'is_confinement_1', 'is_confinement_2', 'hour', 'weekday', 'month', 'year', 'is_daylight', 'cm', 'cl', 'ch']
numerical_columns = ['latitude',
                        'longitude',
                        'dom',
                        'week',
                        'dom_counter',
                        'month_counter',
                        'year_counter',
                        'pmer',
                        'tend',
                        'cod_tend',
                        'dd',
                        'ff',
                        't',
                        'td',
                        'u',
                        'vv',
                        'ww',
                        'w1',
                        'w2',
                        'n',
                        'nbas',
                        'tend24',
                        'raf10',
                        'rafper',
                        'etat_sol',
                        'ht_neige',
                        'ssfrai',
                        'perssfrai',
                        'rr1',
                        'rr3',
                        'rr6',
                        'rr12',
                        'rr24']

preprocessor = ColumnTransformer([('one_hot_encoder', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_columns),
                                ('standard_scaler', StandardScaler(), numerical_columns)])

test = preprocessor.fit_transform(merged_data)

test.shape

(455163, 193)

In [9]:
feature_engineering = FunctionTransformer(_feature_engineering)

pipe = make_pipeline(feature_engineering, preprocessor)

test_2 = pipe.fit_transform(data)

test_2.shape

(455163, 193)

In [12]:
len(categorical_columns)

14

In [13]:
len(numerical_columns)

33

In [14]:
merged_data.columns

Index(['counter_id', 'site_id', 'bike_count', 'latitude', 'longitude',
       'log_bike_count', 'hour', 'weekday', 'dom', 'week', 'month', 'year',
       'dom_counter', 'month_counter', 'year_counter', 'is_ferie',
       'is_holiday', 'is_daylight', 'date', 'pmer', 'tend', 'cod_tend', 'dd',
       'ff', 't', 'td', 'u', 'vv', 'ww', 'w1', 'w2', 'n', 'nbas', 'cl', 'cm',
       'ch', 'tend24', 'raf10', 'rafper', 'etat_sol', 'ht_neige', 'ssfrai',
       'perssfrai', 'rr1', 'rr3', 'rr6', 'rr12', 'rr24', 'is_confinement_1',
       'is_confinement_2'],
      dtype='object')

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from vacances_scolaires_france import SchoolHolidayDates
from jours_feries_france import JoursFeries
from datetime import datetime, date
import seaborn as sns
import matplotlib.pyplot as plt
from astral import LocationInfo
from astral.sun import daylight

def _is_daylight(x):

    city=LocationInfo('Paris', timezone='Europe/Paris')
    sun_info = daylight(city.observer, date=x.to_pydatetime().date(), tzinfo='Europe/Paris')
    x = x.tz_localize('Europe/Paris', ambiguous=True, nonexistent='shift_forward')
    return (x > sun_info[0]) & (x < sun_info[1])

def _feature_engineering(data):
    data = data.copy()
    d = SchoolHolidayDates()
    jf = JoursFeries()
    data['hour'] = data.date.dt.hour
    data['weekday'] = data.date.dt.weekday
    data['dom'] = data.date.dt.day
    data['week'] = data.date.dt.isocalendar().week
    data['month'] = data.date.dt.month
    data['year'] = data.date.dt.year
    data['dom_counter'] = data.counter_installation_date.dt.day
    data['month_counter'] = data.counter_installation_date.dt.month
    data['year_counter'] = data.counter_installation_date.dt.year
    data['date_datetime'] = data.date.map(lambda x: x.to_pydatetime().date())
    data['is_ferie'] = data.date_datetime.map(lambda x: jf.is_bank_holiday(x, zone='Métropole'))
    data['is_holiday'] = data.date_datetime.map(lambda x: d.is_holiday_for_zone(x, 'C'))
    data.drop(columns=['counter_name', 'site_name', 'counter_technical_id', 'counter_installation_date', 'date_datetime'], inplace=True)
    data['is_daylight'] = data.date.map(_is_daylight)
    external_data = pd.read_csv("./submissions/external_data/external_data.csv")
    external_data.cl = external_data.cl.fillna(value=100)
    external_data.cm = external_data.cm.fillna(value=100)
    external_data.ch = external_data.ch.fillna(value=100)
    external_data.ssfrai = external_data.ssfrai.fillna(value=0.0)
    external_data.perssfrai = external_data.perssfrai.fillna(value=0.0)
    external_data.dropna(axis=1, thresh=3000, inplace=True)
    external_data.fillna(method='ffill', inplace=True)
    external_data['datetime_date'] = pd.to_datetime(external_data.date)
    external_data.drop(columns=['numer_sta', 'per', 'pres'], inplace=True)
    ext_index = external_data.set_index('datetime_date')
    ext_index.sort_index(inplace=True)
    data_index = data.set_index('date')
    data_index.sort_index(inplace=True)
    merged_data = pd.merge_asof(data_index, ext_index, left_index=True, right_index=True)
    merged_data['is_confinement_1'] = (merged_data.date > '2020-10-30') & (merged_data.date < '2020-12-15')
    merged_data['is_confinement_2'] = (merged_data.date > '2021-04-03') & (merged_data.date < '2021-05-03')

    return merged_data

def get_estimator():
    feature_engineering=FunctionTransformer(_feature_engineering)

    categorical_columns = ['counter_id', 'site_id', 'is_ferie', 'is_holiday', 'is_confinement_1', 'is_confinement_2', 'hour', 'weekday', 'month', 'year', 'is_daylight', 'cm', 'cl', 'ch']
    numerical_columns = ['latitude',
                        'longitude',
                        'dom',
                        'week',
                        'dom_counter',
                        'month_counter',
                        'year_counter',
                        'pmer',
                        'tend',
                        'cod_tend',
                        'dd',
                        'ff',
                        't',
                        'td',
                        'u',
                        'vv',
                        'ww',
                        'w1',
                        'w2',
                        'n',
                        'nbas',
                        'tend24',
                        'raf10',
                        'rafper',
                        'etat_sol',
                        'ht_neige',
                        'ssfrai',
                        'perssfrai',
                        'rr1',
                        'rr3',
                        'rr6',
                        'rr12',
                        'rr24']

    preprocessor = ColumnTransformer([('one_hot_encoder', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_columns),
                                ('standard_scaler', StandardScaler(), numerical_columns)])

    pipe = make_pipeline(feature_engineering, preprocessor)

    

In [2]:
data = pd.read_parquet(Path('data') / 'train.parquet')

In [11]:
feature_engineering=FunctionTransformer(_feature_engineering)

categorical_columns = ['counter_id', 'site_id', 'is_ferie', 'is_holiday', 'is_confinement_1', 'is_confinement_2', 'hour', 'weekday', 'month', 'year', 'is_daylight', 'cm', 'cl', 'ch']
numerical_columns = ['latitude',
                    'longitude',
                    'dom',
                    'week',
                    'dom_counter',
                    'month_counter',
                    'year_counter',
                    'pmer',
                    'tend',
                    'cod_tend',
                    'dd',
                    'ff',
                    't',
                    'td',
                    'u',
                    'vv',
                    'ww',
                    'w1',
                    'w2',
                    'n',
                    'nbas',
                    'tend24',
                    'raf10',
                    'rafper',
                    'etat_sol',
                    'ht_neige',
                    'ssfrai',
                    'perssfrai',
                    'rr1',
                    'rr3',
                    'rr6',
                    'rr12',
                    'rr24']

preprocessor = ColumnTransformer([('one_hot_encoder', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_columns),
                            ('standard_scaler', StandardScaler(), numerical_columns)])

inputs = keras.Input(shape=(192,)) #TO BE CHANGED
dense = layers.Dense(256, activation='relu')
x = dense(inputs)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dense(64, activation='relu')(x)
outputs = layers.Dense(1)(x)
model = keras.Model(inputs=inputs, outputs=outputs, name='bikes_deep')
model.compile(optimizer='adam', loss='mean_squared_error')

pipe = make_pipeline(feature_engineering, preprocessor, model)

In [13]:
test = pipe.fit(data)

ValueError: in user code:

    File "C:\Users\natha\anaconda3\envs\bikes-ramp\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\natha\anaconda3\envs\bikes-ramp\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\natha\anaconda3\envs\bikes-ramp\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\natha\anaconda3\envs\bikes-ramp\lib\site-packages\keras\engine\training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\natha\anaconda3\envs\bikes-ramp\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\natha\anaconda3\envs\bikes-ramp\lib\site-packages\keras\engine\input_spec.py", line 263, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "bikes_deep" is incompatible with the layer: expected shape=(None, 192), found shape=(None, 193)


In [17]:
np.array(test[:, 0]).shape

(455163,)