In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow import keras
from tensorflow.keras import layers
from vacances_scolaires_france import SchoolHolidayDates
from jours_feries_france import JoursFeries
from datetime import datetime, date
import seaborn as sns
import matplotlib.pyplot as plt
from astral import LocationInfo
from astral.sun import daylight

In [2]:
def _provide_date_info(data):

    data = data.copy()
    d = SchoolHolidayDates()
    jf = JoursFeries()
    data['hour'] = data.date.dt.hour
    data['weekday'] = data.date.dt.weekday
    data['dom'] = data.date.dt.day
    data['week'] = data.date.dt.isocalendar().week
    data['month'] = data.date.dt.month
    data['year'] = data.date.dt.year
    data['dom_counter'] = data.counter_installation_date.dt.day
    data['month_counter'] = data.counter_installation_date.dt.month
    data['year_counter'] = data.counter_installation_date.dt.year
    data['date_datetime'] = data.date.map(lambda x: x.to_pydatetime().date())
    data['is_ferie'] = data.date_datetime.map(lambda x: jf.is_bank_holiday(x, zone='Métropole'))
    data['is_holiday'] = data.date_datetime.map(lambda x: d.is_holiday_for_zone(x, 'C'))
    data.drop(columns=['counter_name', 'site_name', 'counter_technical_id', 'counter_installation_date', 'date_datetime'], inplace=True)

    return data

In [3]:
def _is_daylight(x):

    city=LocationInfo('Paris', timezone='Europe/Paris')
    sun_info = daylight(city.observer, date=x.to_pydatetime().date(), tzinfo='Europe/Paris')
    x = x.tz_localize('Europe/Paris', ambiguous=True, nonexistent='shift_forward')
    return (x > sun_info[0]) & (x < sun_info[1])

def _provide_daylight_info(data):

    data = data.copy()
    data['is_daylight'] = data.date.map(_is_daylight)

    return data

In [4]:
def _clean_data(data):

    data = data.copy()
    data_grouped = data.groupby(by=['counter_id', 'date_datetime']).sum()
    counter_down = data_grouped[data_grouped.bike_count == 0].reset_index()
    counter_down = counter_down['counter_id'].astype('str') + counter_down['date_datetime'].astype('str')
    counter_down = list(counter_down)
    data['down_test'] = data['counter_id'].astype('str') + data['date_datetime'].astype('str')
    data['is_down'] = data.down_test.map(lambda x: x in counter_down)
    data = data[data.is_down==False]
    data.drop(columns=['is_down', 'down_test', 'date_datetime'], inplace=True)

    return data

In [5]:
def _add_external_data(data):

    data = data.copy()
    external_data = pd.read_csv("./submissions/external_data/external_data.csv")
    external_data.cl = external_data.cl.fillna(value=100)
    external_data.cm = external_data.cm.fillna(value=100)
    external_data.ch = external_data.ch.fillna(value=100)
    external_data.ssfrai = external_data.ssfrai.fillna(value=0.0)
    external_data.perssfrai = external_data.perssfrai.fillna(value=0.0)
    external_data.dropna(axis=1, thresh=3000, inplace=True)
    external_data.fillna(method='ffill', inplace=True)
    external_data['datetime_date'] = pd.to_datetime(external_data.date)
    external_data.drop(columns=['numer_sta', 'per', 'pres'], inplace=True)
    ext_index = external_data.set_index('datetime_date')
    ext_index.sort_index(inplace=True)
    data_index = data.set_index('date')
    data_index.sort_index(inplace=True)
    merged_data = pd.merge_asof(data_index, ext_index, left_index=True, right_index=True)
    merged_data['is_confinement_1'] = (merged_data.date > '2020-10-30') & (merged_data.date < '2020-12-15')
    merged_data['is_confinement_2'] = (merged_data.date > '2021-04-03') & (merged_data.date < '2021-05-03')

    return merged_data

In [6]:
# Load the training set from data/train.parquet.
data = pd.read_parquet(Path('data') / 'train.parquet')

In [7]:
# Add the calendar / holiday features; `data` is rebound to the transformed copy.
data = _provide_date_info(data)

In [9]:
# Add the boolean `is_daylight` column computed from `date`.
data = _provide_daylight_info(data)

In [11]:
# Merge the external data and add the confinement flags; the result is indexed by `date`.
data = _add_external_data(data)

In [12]:
# Columns that are one-hot encoded.
categorical_columns = [
    'counter_id', 'site_id',
    'is_ferie', 'is_holiday', 'is_confinement_1', 'is_confinement_2',
    'hour', 'weekday', 'month', 'year', 'is_daylight',
    'cm', 'cl', 'ch',
]

# Columns that are standardised.
numerical_columns = [
    'latitude', 'longitude',
    'dom', 'week',
    'dom_counter', 'month_counter', 'year_counter',
    'pmer', 'tend', 'cod_tend', 'dd', 'ff', 't', 'td', 'u', 'vv',
    'ww', 'w1', 'w2', 'n', 'nbas', 'tend24', 'raf10', 'rafper',
    'etat_sol', 'ht_neige', 'ssfrai', 'perssfrai',
    'rr1', 'rr3', 'rr6', 'rr12', 'rr24',
]

# Only the columns listed above are passed on; every other column of the input
# frame is dropped by the ColumnTransformer's default `remainder`.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'one_hot_encoder',
            OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'),
            categorical_columns,
        ),
        (
            'standard_scaler',
            StandardScaler(),
            numerical_columns,
        ),
    ]
)


In [13]:
# Fit the preprocessor on the feature frame and keep the transformed output.
test = preprocessor.fit_transform(data)

In [14]:
# Shape of the transformed output produced by the preprocessor.
test.shape

(455163, 193)

In [15]:
# Start again from the raw training data: `data` was rebound to the
# transformed frame by the earlier cells.
data = pd.read_parquet(Path('data', 'train.parquet'))

# Wrap each feature function so it can be used as a pipeline step.
provide_date_info = FunctionTransformer(func=_provide_date_info)
provide_daylight_info = FunctionTransformer(func=_provide_daylight_info)
clean_data = FunctionTransformer(func=_clean_data)  # built here but not part of `pipe`
add_external_data = FunctionTransformer(func=_add_external_data)

pipe = make_pipeline(
    provide_date_info,
    provide_daylight_info,
    add_external_data,
)

test_2 = pipe.fit_transform(data)

In [16]:
# Display the frame returned by the feature pipeline.
test_2

Unnamed: 0_level_0,counter_id,site_id,bike_count,latitude,longitude,log_bike_count,hour,weekday,dom,week,...,ht_neige,ssfrai,perssfrai,rr1,rr3,rr6,rr12,rr24,is_confinement_1,is_confinement_2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-09-01 01:00:00,100056332-104056332,100056332,0.0,48.83848,2.37587,0.000000,1,1,1,36,...,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,False,False
2020-09-01 01:00:00,100047547-104047547,100047547,4.0,48.82636,2.30303,1.609438,1,1,1,36,...,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,False,False
2020-09-01 01:00:00,100047547-103047547,100047547,2.0,48.82636,2.30303,1.098612,1,1,1,36,...,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,False,False
2020-09-01 01:00:00,100057380-103057380,100057380,0.0,48.86462,2.31444,0.000000,1,1,1,36,...,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,False,False
2020-09-01 01:00:00,100047548-103047548,100047548,2.0,48.89141,2.38482,1.098612,1,1,1,36,...,0.0,0.0,-60.0,0.0,0.0,0.0,0.0,0.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-09 23:00:00,100057329-103057329,100057329,16.0,48.84201,2.36729,2.833213,23,0,9,32,...,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,False,False
2021-08-09 23:00:00,100060178-101060178,100060178,7.0,48.84638,2.31529,2.079442,23,0,9,32,...,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,False,False
2021-08-09 23:00:00,100056335-104056335,100056335,8.0,48.86288,2.31179,2.197225,23,0,9,32,...,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,False,False
2021-08-09 23:00:00,100042374-109042374,100042374,3.0,48.84840,2.27586,1.386294,23,0,9,32,...,0.0,0.0,-30.0,0.0,0.0,0.0,0.0,-0.1,False,False
