# Import

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os
import time
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re
import random

from tqdm.notebook import tqdm

import dask.dataframe as dd
from dask.distributed import LocalCluster, Client

sys.path.insert(0, 'tools/')

from tools import * 

# Run DASK dist client 

In [2]:

cluster = LocalCluster()
client = Client(cluster)


Perhaps you already have a cluster running?
Hosting the HTTP server on port 45013 instead


In [3]:
cluster

0,1
Dashboard: http://127.0.0.1:45013/status,Workers: 5
Total threads: 20,Total memory: 31.03 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:40153,Workers: 5
Dashboard: http://127.0.0.1:45013/status,Total threads: 20
Started: Just now,Total memory: 31.03 GiB

0,1
Comm: tcp://127.0.0.1:37539,Total threads: 4
Dashboard: http://127.0.0.1:39951/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:34399,
Local directory: /tmp/dask-scratch-space/worker-mmyjyldi,Local directory: /tmp/dask-scratch-space/worker-mmyjyldi

0,1
Comm: tcp://127.0.0.1:39201,Total threads: 4
Dashboard: http://127.0.0.1:40293/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:39325,
Local directory: /tmp/dask-scratch-space/worker-pf52hc12,Local directory: /tmp/dask-scratch-space/worker-pf52hc12

0,1
Comm: tcp://127.0.0.1:43489,Total threads: 4
Dashboard: http://127.0.0.1:34397/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:46457,
Local directory: /tmp/dask-scratch-space/worker-4rz1q0k6,Local directory: /tmp/dask-scratch-space/worker-4rz1q0k6

0,1
Comm: tcp://127.0.0.1:41049,Total threads: 4
Dashboard: http://127.0.0.1:35989/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:46467,
Local directory: /tmp/dask-scratch-space/worker-on52f6an,Local directory: /tmp/dask-scratch-space/worker-on52f6an

0,1
Comm: tcp://127.0.0.1:39909,Total threads: 4
Dashboard: http://127.0.0.1:37487/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:45951,
Local directory: /tmp/dask-scratch-space/worker-5122mln0,Local directory: /tmp/dask-scratch-space/worker-5122mln0


In [4]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:45013/status,

0,1
Dashboard: http://127.0.0.1:45013/status,Workers: 5
Total threads: 20,Total memory: 31.03 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:40153,Workers: 5
Dashboard: http://127.0.0.1:45013/status,Total threads: 20
Started: Just now,Total memory: 31.03 GiB

0,1
Comm: tcp://127.0.0.1:37539,Total threads: 4
Dashboard: http://127.0.0.1:39951/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:34399,
Local directory: /tmp/dask-scratch-space/worker-mmyjyldi,Local directory: /tmp/dask-scratch-space/worker-mmyjyldi

0,1
Comm: tcp://127.0.0.1:39201,Total threads: 4
Dashboard: http://127.0.0.1:40293/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:39325,
Local directory: /tmp/dask-scratch-space/worker-pf52hc12,Local directory: /tmp/dask-scratch-space/worker-pf52hc12

0,1
Comm: tcp://127.0.0.1:43489,Total threads: 4
Dashboard: http://127.0.0.1:34397/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:46457,
Local directory: /tmp/dask-scratch-space/worker-4rz1q0k6,Local directory: /tmp/dask-scratch-space/worker-4rz1q0k6

0,1
Comm: tcp://127.0.0.1:41049,Total threads: 4
Dashboard: http://127.0.0.1:35989/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:46467,
Local directory: /tmp/dask-scratch-space/worker-on52f6an,Local directory: /tmp/dask-scratch-space/worker-on52f6an

0,1
Comm: tcp://127.0.0.1:39909,Total threads: 4
Dashboard: http://127.0.0.1:37487/status,Memory: 6.21 GiB
Nanny: tcp://127.0.0.1:45951,
Local directory: /tmp/dask-scratch-space/worker-5122mln0,Local directory: /tmp/dask-scratch-space/worker-5122mln0


In [5]:
import ctypes

def trim_memory() -> int:
    libc = ctypes.CDLL("libc.so.6")
    return libc.malloc_trim(0)

client.run(trim_memory)

{'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39201': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

# Read and merge

In [6]:
config = pd.Series({
    'path': os.path.abspath('dades'),
    'file_type':'csv',
    'years':[2019, 2020, 2021, 2022, 2023],
    'dataset': 'BicingNou_ESTACIONS_MOD'
})

In [7]:
%%time

def read_data_all(cnfg:dict):

    data = dict()

    for year in cnfg.years:
        cnfg['year'] = year
        data[year] = load_checkpoint(cnfg)
    
    return dd.concat(list(data.values()), interleave_partitions=False)
    
bbdd_completa = read_data_all(config)   

checkpoint reloaded.
checkpoint reloaded.
checkpoint reloaded.
checkpoint reloaded.
checkpoint reloaded.
CPU times: user 62.4 ms, sys: 6.92 ms, total: 69.3 ms
Wall time: 68.6 ms


# Preprocessing

In [8]:
%%time

index=0
before = get_ddf_shape(bbdd_completa)
print(before)

index+=1
print(index)
# 1 Error fix. # 2019 gener, febrer and marc have the status reversed
cond = (bbdd_completa.year == 2019) & (bbdd_completa.month.isin([1,2,3]))
bbdd_completa['status'] = bbdd_completa['status'].mask(cond, ((bbdd_completa['status'] + 1) %2))
index+=1
print(index)
# 2 Remove not needed status, manitenance, planned. open = 0, closed = 1 
cond = (bbdd_completa['status'].isin([2.0,3.0]))
bbdd_completa['status'] = bbdd_completa['status'].mask(cond, 1.0)
# bbdd_completa = bbdd_completa.dropna(subset=['status'])
index+=1
print(index)
# 3 Remove data from 2020
# cond = (bbdd_completa['year'].isin([2020]))
# bbdd_completa = bbdd_completa.mask(cond, np.nan)
# bbdd_completa = bbdd_completa.dropna(subset=['year'])
# index+=1
# print(index)
# 4 2019 to 2020 to have continuos data 
# cond = (bbdd_completa['year'].isin([2019]))
# bbdd_completa['year'] = bbdd_completa['year'].mask(cond, bbdd_completa['year']+1)
# index+=1
# print(index)
# 5 status reversed 
bbdd_completa['status'] = (bbdd_completa['status'] + 1)%2
index+=1
print(index)
# 6 Column has all ones
bbdd_completa = bbdd_completa.drop(columns=['is_installed'])
index+=1
print(index)
# 7 Column has all ones
bbdd_completa = bbdd_completa.drop(columns=['is_charging_station'])
index+=1
print(index)
# 8 Remove row where status = closed and is renting and is returning
# cond = (bbdd_completa['status'].isin([0.0])) & (bbdd_completa['is_renting'].isin([1.0])) & (bbdd_completa['is_returning'].isin([1.0]))
# bbdd_completa = bbdd_completa.mask(cond, np.nan)
# bbdd_completa = bbdd_completa.dropna(subset=['status'])

after = get_ddf_shape(bbdd_completa)
print(after)

print('Changes to dataframe durinf preprocessing')
print(f'dropeed {(after[0]-before[0]):02d} rows')
print(f'dropped {(after[1]-before[1]):02d} columns')
# 14138178

(17707721, 23)
1
2
3
4
5
6
(17707721, 21)
Changes to dataframe durinf preprocessing
dropeed 00 rows
dropped -2 columns
CPU times: user 3.68 s, sys: 481 ms, total: 4.16 s
Wall time: 15.8 s


## Drop duplicate data

In [9]:
client.run(trim_memory)

{'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39201': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

In [10]:
def drop_duplicates_per_year(ddf):
    years = ddf.year.unique()
    
    before = get_ddf_shape(ddf)
    print(before)
    
    data = list()
    
    for year in years:
        print(year)
        data_year = ddf[ddf.year.isin([year])]
        
        data_year = data_year.drop_duplicates(subset=['timestamp', 'station_id'], keep='last')
        
        data.append(data_year)
        
        client.run(trim_memory)
        
        time.sleep(1)
        
    ddf_clean = dd.concat(data, interleave_partitions=False)
    
    after = get_ddf_shape(ddf_clean)
    print(after)

    print('Changes to dataframe durinf preprocessing')
    print(f'dropeed {(after[0]-before[0]):02d} rows')
    print(f'dropped {(after[1]-before[1]):02d} columns')
    return ddf_clean


In [11]:
%%time

bbdd_completa = drop_duplicates_per_year(bbdd_completa)

(17707721, 21)
2019
2020
2021
2022
2023
(17561940, 21)
Changes to dataframe durinf preprocessing
dropeed -145781 rows
dropped 00 columns
CPU times: user 9.24 s, sys: 733 ms, total: 9.98 s
Wall time: 35.9 s


## Drop not needed data

In [12]:


before = get_ddf_shape(bbdd_completa)
print(before)
# remove data where status is closed
# remove data where status is open but is not returing nor renting
cond = (bbdd_completa['status'].isin([0]))
bbdd_completa['status'] = bbdd_completa['status'].mask(cond, np.nan)
bbdd_completa = bbdd_completa.dropna(subset=['status'])

cond = (bbdd_completa['status'].isin([1])) & (bbdd_completa['is_returning'].isin([0])) & (bbdd_completa['is_renting'].isin([0]))
bbdd_completa['status'] = bbdd_completa['status'].mask(cond, np.nan)
bbdd_completa = bbdd_completa.dropna(subset=['status'])

after = get_ddf_shape(bbdd_completa)
print(after)

print('Changes to dataframe durinf preprocessing')
print(f'dropeed {(after[0]-before[0]):02d} rows')
print(f'dropped {(after[1]-before[1]):02d} columns')


(17561940, 21)
(17103752, 21)
Changes to dataframe durinf preprocessing
dropeed -458188 rows
dropped 00 columns


## Drop not needed columns 

In [13]:
bbdd_completa = bbdd_completa.drop(
    columns=[
        'num_docks_available',
        'timestamp',
        'num_bikes_available_types.ebike',
        'num_bikes_available_types.mechanical',
        'num_bikes_available',
        'is_returning',
        'is_renting',
        'status'
    ]
)

# Additional datasets

## Festius 2019 - 2023 

In [14]:
client.run(trim_memory)

{'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39201': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

In [15]:
df_calendari_festius = pd.read_csv('./Altres_dades/calendari_festius.csv',sep=';')

In [16]:
df_calendari_festius['fecha'] = pd.to_datetime(df_calendari_festius['fecha'], format='%d/%m/%y')
df_calendari_festius.dtypes

fecha              datetime64[ns]
festius                    object
festius_sun                object
festius_sun_sat            object
dtype: object

In [17]:
df_calendari_festius['year'] = df_calendari_festius['fecha'].dt.year
df_calendari_festius['month'] = df_calendari_festius['fecha'].dt.month
df_calendari_festius['day'] = df_calendari_festius['fecha'].dt.day

In [18]:
df_calendari_festius['dayofweek'] = df_calendari_festius['fecha'].dt.dayofweek

In [19]:
type(df_calendari_festius)

pandas.core.frame.DataFrame

In [20]:
df_calendari_festius.drop('fecha', axis=1, inplace=True)

In [21]:
# replace IN_SERVICE with 0 and CLOSED with 1 
df_calendari_festius['festius'].replace(
    to_replace=['No', 'Yes'],                       
    value=[0, 1], inplace=True)

df_calendari_festius['festius_sun'].replace(
    to_replace=['No', 'Yes'],                       
    value=[0, 1], inplace=True)

df_calendari_festius['festius_sun_sat'].replace(
    to_replace=['No', 'Yes'],                       
    value=[0, 1], inplace=True)


In [22]:
df_calendari_festius.loc[df_calendari_festius.dayofweek.isin([5,6]), 'festius_sun_sat'] = 1
df_calendari_festius.loc[df_calendari_festius.dayofweek.isin([6]), 'festius_sun'] = 1

In [23]:
df_calendari_festius['weekend'] = df_calendari_festius.dayofweek.isin([5,6]).astype(int)

In [24]:
df_calendari_festius[df_calendari_festius.year.isin([2019])].shape

(365, 8)

In [25]:
df_calendari_festius.loc[(df_calendari_festius.festius == 1) & (df_calendari_festius.festius_sun != 1), 'festius'] = 0 

In [26]:
df_calendari_festius.loc[(df_calendari_festius.month == 1) & (df_calendari_festius.day.isin([1,6])), ['festius','festius_sun','festius_sun_sat']] = 1

In [27]:
df_calendari_festius[df_calendari_festius.festius.isin([1]) & df_calendari_festius.year.isin([2019])]

Unnamed: 0,festius,festius_sun,festius_sun_sat,year,month,day,dayofweek,weekend
0,1,1,1,2019,1,1,1,0
5,1,1,1,2019,1,6,6,1
108,1,1,1,2019,4,19,4,0
111,1,1,1,2019,4,22,0,0
120,1,1,1,2019,5,1,2,0
160,1,1,1,2019,6,10,0,0
174,1,1,1,2019,6,24,0,0
226,1,1,1,2019,8,15,3,0
253,1,1,1,2019,9,11,2,0
266,1,1,1,2019,9,24,1,0


In [28]:
df_calendari_festius[df_calendari_festius.festius.isin([1]) & df_calendari_festius.year.isin([2022])]

Unnamed: 0,festius,festius_sun,festius_sun_sat,year,month,day,dayofweek,weekend
1096,1,1,1,2022,1,1,5,1
1101,1,1,1,2022,1,6,3,0
1200,1,1,1,2022,4,15,4,0
1203,1,1,1,2022,4,18,0,0
1252,1,1,1,2022,6,6,0,0
1270,1,1,1,2022,6,24,4,0
1322,1,1,1,2022,8,15,0,0
1362,1,1,1,2022,9,24,5,1
1364,1,1,1,2022,9,26,0,0
1380,1,1,1,2022,10,12,2,0


In [29]:
df_calendari_festius[df_calendari_festius.festius.isin([1]) & df_calendari_festius.year.isin([2023])]

Unnamed: 0,festius,festius_sun,festius_sun_sat,year,month,day,dayofweek,weekend
1461,1,1,1,2023,1,1,6,1
1466,1,1,1,2023,1,6,4,0
1557,1,1,1,2023,4,7,4,0
1560,1,1,1,2023,4,10,0,0


In [30]:
df_calendari_festius.drop('dayofweek', axis=1, inplace=True)

## Meteo 2019 - 2023 

In [31]:
client.run(trim_memory)

{'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39201': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

In [32]:
df_meteo_metadata = pd.read_csv('./Altres_dades/MetadataMeteo/MeteoCat_Metadades.csv',sep=',')

In [33]:
df_meteo_metadata

Unnamed: 0,CODI_VARIABLE,NOM_VARIABLE,UNITAT,ACRÒNIM
0,1000,Temperatura mitjana diària,°C,TM
1,1001,Temperatura màxima diària + hora,°C,TX
2,1002,Temperatura mínima diària + hora,°C,TN
3,1100,Humitat relativa mitjana diària,%,HRM
4,1101,Humitat relativa màxima diària + data,%,HRX
5,1102,Humitat relativa mínima diària + data,%,HRN
6,1200,Pressió atmosfèrica mitjana diària,hPa,PM
7,1201,Pressió atmosfèrica màxima diària + hora,hPa,PX
8,1202,Pressió atmosfèrica mínima diària + hora,hPa,PN
9,1300,Precipitació acumulada diària,mm,PPT


In [34]:
df_estadistica_meteo_19_d5 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2019_d5_observatori_fabra.csv',sep=',')
df_estadistica_meteo_19_x2 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2019_x2_barcelona_zoo.csv',sep=',')
df_estadistica_meteo_19_x4 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2019_x4_barcelona_el_raval.csv',sep=',')
df_estadistica_meteo_19_x8 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2019_x8_barcelona_zona_universitaria.csv',sep=',')

In [35]:
def convert_columns(data):
    print(data.shape)
    
    data.rename(
        columns={
            'PPT24H':'PPT',
            'DVX10':'DVVX10',
            'RS24H':'RS24h',
            'HPA':'PM'
        },
        inplace=True
    )
    
    for acro in df_meteo_metadata['ACRÒNIM'].values:
        if acro not in data.columns.values:
            data[acro] = np.nan
    
    code = data.CODI_ESTACIO.unique().tolist().pop()
    
    data.drop('CODI_ESTACIO', axis=1, inplace=True)

    data = data.set_index('DATA_LECTURA')

    data.columns = [f'VALOR_{col}_{code}' for col in data.columns.values]

    data = data.reset_index()

    data['DATA_LECTURA'] = pd.to_datetime(data['DATA_LECTURA'], format='%d/%m/%Y')
#     data['year'] = data['DATA_LECTURA'].dt.year
    
    print(data.shape)
    
    return data

In [36]:
df_estadistica_meteo_19_d5 = convert_columns(df_estadistica_meteo_19_d5)

df_estadistica_meteo_19_x2 = convert_columns(df_estadistica_meteo_19_x2)

df_estadistica_meteo_19_x4 = convert_columns(df_estadistica_meteo_19_x4)

df_estadistica_meteo_19_x8 = convert_columns(df_estadistica_meteo_19_x8)

(8766, 13)
(8766, 16)
(4832, 6)
(4832, 16)
(4809, 13)
(4809, 16)
(4262, 13)
(4262, 16)


In [37]:
df_estadistica_meteo_19 = pd.merge(
    pd.merge(
        df_estadistica_meteo_19_d5, df_estadistica_meteo_19_x2, 
        on='DATA_LECTURA',
        how='outer',
    ),
    pd.merge(
        df_estadistica_meteo_19_x4, df_estadistica_meteo_19_x8, 
        on='DATA_LECTURA',
        how='outer',
    ), 
    on='DATA_LECTURA',
    how='outer',
)


In [38]:
df_estadistica_meteo_19['year'] = df_estadistica_meteo_19['DATA_LECTURA'].dt.year

df_estadistica_meteo_19 = df_estadistica_meteo_19[df_estadistica_meteo_19.year.isin([2019])]

In [39]:
df_estadistica_meteo_20 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2020_MeteoCat_Detall_Estacions.csv',sep=',')
df_estadistica_meteo_21 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2021_MeteoCat_Detall_Estacions.csv',sep=',')
df_estadistica_meteo_22 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2022_MeteoCat_Detall_Estacions.csv',sep=',')
df_estadistica_meteo_23 = pd.read_csv('./Altres_dades/EstadísticsEstacionsMeteo/2023_MeteoCat_Detall_Estacions.csv',sep=',')


In [40]:
%%time 

df_estadistica_meteo_20.drop('DATA_EXTREM', axis=1, inplace=True)
print(df_estadistica_meteo_20.shape)

df_estadistica_meteo_21.drop('DATA_EXTREM', axis=1, inplace=True)
print(df_estadistica_meteo_21.shape)

df_estadistica_meteo_22.drop('DATA_EXTREM', axis=1, inplace=True)
print(df_estadistica_meteo_22.shape)

df_estadistica_meteo_23.drop('DATA_EXTREM', axis=1, inplace=True)
print(df_estadistica_meteo_23.shape)

df_20_21_22_23 = pd.concat([df_estadistica_meteo_20, df_estadistica_meteo_21, df_estadistica_meteo_22, df_estadistica_meteo_23])
print(df_20_21_22_23.shape)
#change indexing 
df_20_21_22_23 = pd.DataFrame(df_20_21_22_23.set_index(['ACRÒNIM', 'DATA_LECTURA', 'CODI_ESTACIO']).unstack(['ACRÒNIM']))
df_20_21_22_23 = df_20_21_22_23.reindex()
df_20_21_22_23 = df_20_21_22_23.unstack()
df_20_21_22_23.columns = ['_'.join(col) for col in df_20_21_22_23.columns.values]
df_20_21_22_23 = df_20_21_22_23.reset_index()

df_20_21_22_23['DATA_LECTURA'] = pd.to_datetime(df_20_21_22_23['DATA_LECTURA'], format='%Y-%m-%d')
df_20_21_22_23['year'] = df_20_21_22_23['DATA_LECTURA'].dt.year

# checking duplicates 
print('is there duplicates: ', (df_20_21_22_23.groupby('DATA_LECTURA').count().reset_index().VALOR_DVM10_D5 > 1 ).any())

print(df_20_21_22_23.shape)

(18637, 4)
(18615, 4)
(18605, 4)
(8007, 4)
(63864, 4)
is there duplicates:  False
(1253, 62)
CPU times: user 30.5 ms, sys: 0 ns, total: 30.5 ms
Wall time: 29.9 ms


In [41]:
df_estadistica_meteo = pd.concat([df_estadistica_meteo_19, df_20_21_22_23])

In [42]:
df_estadistica_meteo

Unnamed: 0,DATA_LECTURA,VALOR_TM_D5,VALOR_TX_D5,VALOR_TN_D5,VALOR_HRM_D5,VALOR_PPT_D5,VALOR_PM_D5,VALOR_RS24h_D5,VALOR_VVM10_D5,VALOR_DVM10_D5,VALOR_VVX10_D5,VALOR_DVVX10_D5,VALOR_HRX_D5,VALOR_HRN_D5,VALOR_PX_D5,VALOR_PN_D5,VALOR_TM_X2,VALOR_TX_X2,VALOR_TN_X2,VALOR_HRM_X2,VALOR_HRX_X2,VALOR_HRN_X2,VALOR_PM_X2,VALOR_PX_X2,VALOR_PN_X2,VALOR_PPT_X2,VALOR_RS24h_X2,VALOR_VVM10_X2,VALOR_DVM10_X2,VALOR_VVX10_X2,VALOR_DVVX10_X2,VALOR_TM_X4,VALOR_TX_X4,VALOR_TN_X4,VALOR_HRM_X4,VALOR_PPT_X4,VALOR_PM_X4,VALOR_RS24h_X4,VALOR_VVM10_X4,VALOR_DVM10_X4,VALOR_VVX10_X4,VALOR_DVVX10_X4,VALOR_HRX_X4,VALOR_HRN_X4,VALOR_PX_X4,VALOR_PN_X4,VALOR_TM_X8,VALOR_TX_X8,VALOR_TN_X8,VALOR_HRM_X8,VALOR_PPT_X8,VALOR_PM_X8,VALOR_RS24h_X8,VALOR_VVM10_X8,VALOR_DVM10_X8,VALOR_VVX10_X8,VALOR_DVVX10_X8,VALOR_HRX_X8,VALOR_HRN_X8,VALOR_PX_X8,VALOR_PN_X8,year
8401,2019-01-01,10.5,14.4,7.8,73.0,0.0,979.6,5.4,3.8,339.0,15.8,315.0,,,,,9.9,15.4,4.9,79.0,,,,,,,,,,,,12.3,16.0,8.4,66.0,0.0,1024.5,8.2,1.3,307.0,7.9,301.0,,,,,11.2,17.0,7.0,71.0,0.0,1019.0,7.4,1.4,311.0,9.0,303.0,,,,,2019
8402,2019-01-02,8.9,12.9,5.9,48.0,0.0,976.5,9.4,8.3,324.0,17.3,318.0,,,,,8.8,14.5,5.1,63.0,,,,,,,,,,,,11.2,16.3,8.1,47.0,0.0,1021.4,9.2,1.0,280.0,6.3,309.0,,,,,9.5,15.3,5.7,52.0,0.0,1015.9,9.1,1.7,293.0,10.2,304.0,,,,,2019
8403,2019-01-03,8.3,12.6,4.9,59.0,0.0,979.0,9.1,4.0,279.0,9.9,313.0,,,,,7.6,13.0,4.1,69.0,,,,,,,,,,,,10.3,13.7,7.5,53.0,0.0,1024.3,8.9,1.1,264.0,4.6,263.0,,,,,8.8,13.4,4.4,59.0,0.0,1018.7,8.8,1.3,277.0,6.0,306.0,,,,,2019
8404,2019-01-04,7.2,11.3,5.3,69.0,0.0,980.6,8.0,2.0,205.0,7.8,290.0,,,,,7.0,11.7,3.3,73.0,,,,,,,,,,,,9.4,12.3,6.4,59.0,0.0,1026.1,7.9,0.9,94.0,4.8,80.0,,,,,7.0,12.6,2.6,70.0,0.0,1020.5,7.7,1.2,292.0,4.1,164.0,,,,,2019
8405,2019-01-05,9.1,15.2,3.2,47.0,0.0,978.6,9.3,10.0,326.0,17.0,317.0,,,,,8.5,15.3,4.4,63.0,,,,,,,,,,,,10.9,16.3,6.8,45.0,0.0,1023.6,9.2,2.0,282.0,7.5,267.0,,,,,9.4,14.1,5.4,50.0,0.0,1018.1,9.2,3.1,296.0,10.8,307.0,,,,,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,2023-06-02,18.9,24.7,15.4,80.0,0.0,966.5,24.3,2.0,202.0,7.1,157.0,100.0,56.0,967.5,965.2,20.1,24.1,16.9,79.0,93.0,62.0,,,,,,,,,,20.8,23.2,18.2,74.0,0.0,1010.0,24.9,1.7,159.0,5.9,152.0,92.0,55.0,1011.0,1008.6,20.6,25.2,15.8,77.0,0.0,1004.4,23.5,1.5,207.0,6.1,174.0,99.0,54.0,1005.4,1003.0,2023
1249,2023-06-03,18.2,22.8,15.3,86.0,0.1,966.9,15.4,1.8,190.0,6.9,151.0,100.0,65.0,968.0,965.4,20.3,22.7,17.6,81.0,91.0,66.0,,,,,,,,,,20.8,22.8,18.9,76.0,0.0,1010.5,18.2,1.6,149.0,6.6,174.0,91.0,59.0,1011.7,1009.0,20.5,25.3,16.9,78.0,0.0,1004.8,16.2,1.4,170.0,6.9,181.0,100.0,58.0,1006.1,1003.4,2023
1250,2023-06-04,19.1,25.7,16.2,82.0,0.0,967.8,21.6,1.8,184.0,7.0,167.0,100.0,51.0,969.0,966.6,20.5,24.5,17.9,81.0,91.0,69.0,,,,,,,,,,21.0,23.9,18.9,76.0,0.0,1011.4,23.8,1.6,127.0,6.3,101.0,93.0,62.0,1012.6,1010.2,20.8,25.9,17.2,77.0,0.0,1005.7,22.5,1.4,176.0,6.3,132.0,99.0,51.0,1007.0,1004.6,2023
1251,2023-06-05,19.9,25.5,15.9,75.0,0.0,968.0,26.1,3.3,239.0,8.9,191.0,99.0,52.0,968.8,966.9,21.5,25.7,17.3,75.0,91.0,62.0,,,,,,,,,,22.0,24.6,18.9,67.0,0.0,1011.5,28.6,1.8,193.0,7.7,155.0,80.0,56.0,1012.4,1010.0,21.8,26.5,16.7,70.0,0.0,1005.9,27.7,1.7,220.0,7.2,209.0,95.0,51.0,1006.8,1004.7,2023


In [43]:
df_estadistica_meteo['month'] = df_estadistica_meteo['DATA_LECTURA'].dt.month
df_estadistica_meteo['day'] = df_estadistica_meteo['DATA_LECTURA'].dt.day

In [44]:
df_estadistica_meteo.drop('DATA_LECTURA', axis=1, inplace=True)

In [45]:
%%time

nans = get_features_nans(df_estadistica_meteo)

zeros = get_features_zero(df_estadistica_meteo)

CPU times: user 11.7 ms, sys: 0 ns, total: 11.7 ms
Wall time: 10.7 ms


In [46]:
nans

{'VALOR_TM_D5': 0.061804697156983925,
 'VALOR_HRM_D5': 0.061804697156983925,
 'VALOR_RS24h_D5': 0.4326328800988875,
 'VALOR_VVM10_D5': 0.3708281829419036,
 'VALOR_DVM10_D5': 0.6180469715698393,
 'VALOR_VVX10_D5': 0.4326328800988875,
 'VALOR_DVVX10_D5': 0.30902348578491967,
 'VALOR_HRX_D5': 22.558714462299136,
 'VALOR_HRN_D5': 22.558714462299136,
 'VALOR_PX_D5': 22.558714462299136,
 'VALOR_PN_D5': 22.558714462299136,
 'VALOR_HRX_X2': 22.558714462299136,
 'VALOR_HRN_X2': 22.558714462299136,
 'VALOR_PM_X2': 100.0,
 'VALOR_PX_X2': 100.0,
 'VALOR_PN_X2': 100.0,
 'VALOR_PPT_X2': 100.0,
 'VALOR_RS24h_X2': 100.0,
 'VALOR_VVM10_X2': 100.0,
 'VALOR_DVM10_X2': 100.0,
 'VALOR_VVX10_X2': 100.0,
 'VALOR_DVVX10_X2': 100.0,
 'VALOR_VVM10_X4': 0.061804697156983925,
 'VALOR_DVM10_X4': 0.061804697156983925,
 'VALOR_HRX_X4': 22.558714462299136,
 'VALOR_HRN_X4': 22.558714462299136,
 'VALOR_PX_X4': 22.558714462299136,
 'VALOR_PN_X4': 22.558714462299136,
 'VALOR_HRX_X8': 22.558714462299136,
 'VALOR_HRN_X8': 

In [47]:
zeros

{'VALOR_PPT_D5': 77.07045735475896,
 'VALOR_DVM10_D5': 0.12360939431396785,
 'VALOR_DVVX10_D5': 0.30902348578491967,
 'VALOR_PPT_X4': 79.41903584672436,
 'VALOR_DVM10_X4': 0.061804697156983925,
 'VALOR_PPT_X8': 77.99752781211372}

In [48]:
nans = pd.Series(nans)

nancolumns = nans[nans >= 90].index.values
nancolumns

array(['VALOR_PM_X2', 'VALOR_PX_X2', 'VALOR_PN_X2', 'VALOR_PPT_X2',
       'VALOR_RS24h_X2', 'VALOR_VVM10_X2', 'VALOR_DVM10_X2',
       'VALOR_VVX10_X2', 'VALOR_DVVX10_X2'], dtype=object)

In [49]:
df_estadistica_meteo.drop(columns=nancolumns,axis=1, inplace=True)

In [50]:
df_estadistica_meteo.columns

Index(['VALOR_TM_D5', 'VALOR_TX_D5', 'VALOR_TN_D5', 'VALOR_HRM_D5',
       'VALOR_PPT_D5', 'VALOR_PM_D5', 'VALOR_RS24h_D5', 'VALOR_VVM10_D5',
       'VALOR_DVM10_D5', 'VALOR_VVX10_D5', 'VALOR_DVVX10_D5', 'VALOR_HRX_D5',
       'VALOR_HRN_D5', 'VALOR_PX_D5', 'VALOR_PN_D5', 'VALOR_TM_X2',
       'VALOR_TX_X2', 'VALOR_TN_X2', 'VALOR_HRM_X2', 'VALOR_HRX_X2',
       'VALOR_HRN_X2', 'VALOR_TM_X4', 'VALOR_TX_X4', 'VALOR_TN_X4',
       'VALOR_HRM_X4', 'VALOR_PPT_X4', 'VALOR_PM_X4', 'VALOR_RS24h_X4',
       'VALOR_VVM10_X4', 'VALOR_DVM10_X4', 'VALOR_VVX10_X4', 'VALOR_DVVX10_X4',
       'VALOR_HRX_X4', 'VALOR_HRN_X4', 'VALOR_PX_X4', 'VALOR_PN_X4',
       'VALOR_TM_X8', 'VALOR_TX_X8', 'VALOR_TN_X8', 'VALOR_HRM_X8',
       'VALOR_PPT_X8', 'VALOR_PM_X8', 'VALOR_RS24h_X8', 'VALOR_VVM10_X8',
       'VALOR_DVM10_X8', 'VALOR_VVX10_X8', 'VALOR_DVVX10_X8', 'VALOR_HRX_X8',
       'VALOR_HRN_X8', 'VALOR_PX_X8', 'VALOR_PN_X8', 'year', 'month', 'day'],
      dtype='object')

# Split estaciones 2019 - 2022

## Part 1 

In [51]:
client.run(trim_memory)

{'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39201': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

In [52]:
%%time

bbdd_completa_19_22 = bbdd_completa[bbdd_completa.year.isin([2019, 2020, 2021, 2022])]

CPU times: user 1.49 ms, sys: 0 ns, total: 1.49 ms
Wall time: 1.47 ms


In [53]:
%%time

unique_ids_by_year = bbdd_completa_19_22.groupby('year')['station_id'].unique().compute()
print(unique_ids_by_year)

year
2019    [1, 320, 319, 318, 317, 316, 315, 314, 313, 31...
2020    [1, 301, 300, 299, 14, 298, 297, 296, 295, 294...
2021    [1, 271, 400, 510, 26, 270, 269, 401, 268, 267...
2022    [1, 380, 28, 379, 378, 377, 376, 375, 29, 374,...
Name: station_id, dtype: object
CPU times: user 3.12 s, sys: 244 ms, total: 3.36 s
Wall time: 15.8 s


In [54]:
# Calcula los station_id comunes en todos los años
common_ids = set(unique_ids_by_year[2019])
for year, ids in unique_ids_by_year.items():
    print(year, ids.shape)
    common_ids = common_ids.intersection(set(ids))
# common_ids
print('total:',len(common_ids))

2019 (410,)
2020 (508,)
2021 (509,)
2022 (510,)
total: 408


In [55]:
filtered_bbdd = bbdd_completa_19_22[bbdd_completa_19_22['station_id'].isin(common_ids)]
nonfiltered_bbdd = bbdd_completa_19_22[~bbdd_completa_19_22['station_id'].isin(common_ids)]

In [56]:
bbdd_completa_19_22 = None

## Part 2 

In [57]:
client.run(trim_memory)

{'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39201': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

2019

In [58]:
train_data2019 = filtered_bbdd[filtered_bbdd.year.isin([2019])]
get_ddf_shape(train_data2019)

(3023314, 13)

In [59]:
val_data2019 = nonfiltered_bbdd[nonfiltered_bbdd.year.isin([2019])]
get_ddf_shape(val_data2019)

(14882, 13)

2020

In [60]:
train_data2020 = filtered_bbdd[filtered_bbdd.year.isin([2020])]
get_ddf_shape(train_data2020)

(3048728, 13)

In [61]:
val_data2020 = nonfiltered_bbdd[nonfiltered_bbdd.year.isin([2020])]
get_ddf_shape(val_data2020)

(499114, 13)

2021

In [62]:
train_data2021 = filtered_bbdd[filtered_bbdd.year.isin([2021])]
get_ddf_shape(train_data2021)

(3531621, 13)

In [63]:
val_data2021 = nonfiltered_bbdd[nonfiltered_bbdd.year.isin([2021])]
get_ddf_shape(val_data2021)

(871184, 13)

2021

In [64]:
train_data2021 = filtered_bbdd[filtered_bbdd.year.isin([2021])]
get_ddf_shape(train_data2021)

(3531621, 13)

In [65]:
val_data2022 = nonfiltered_bbdd[nonfiltered_bbdd.year.isin([2022])]
get_ddf_shape(val_data2022)

(882341, 13)

2023

In [66]:
val_data2023 = bbdd_completa[bbdd_completa.year.isin([2023])]
get_ddf_shape(val_data2023)

(1692805, 13)

# Convert to Pandas 

2019

In [67]:
%%time 

# conversion a pandas
# datos de training de 2019
X_train2019, y_train2019 = train_data2019.compute().copy(), train_data2019.ctx0.compute().copy()

CPU times: user 6.8 s, sys: 956 ms, total: 7.76 s
Wall time: 34.9 s


In [68]:
%%time 

# conversion a pandas
# datos de validacion de 2019
X_val2019, y_val2019 = val_data2019.compute().copy(), val_data2019.ctx0.compute().copy()

CPU times: user 6.16 s, sys: 568 ms, total: 6.73 s
Wall time: 31.7 s


2020

In [69]:
%%time 

# conversion a pandas
# datos de training de 2020
X_train2020, y_train2020 = train_data2020.compute().copy(), train_data2020.ctx0.compute().copy()

CPU times: user 6.7 s, sys: 887 ms, total: 7.59 s
Wall time: 34.2 s


In [70]:
%%time 

# conversion a pandas
# datos de validacion de 2020
X_val2020, y_val2020 = val_data2020.compute().copy(), val_data2020.ctx0.compute().copy()

CPU times: user 6.29 s, sys: 569 ms, total: 6.85 s
Wall time: 31.9 s


2021

In [71]:
%%time 

# conversion a pandas
# datos de training de 2021
X_train2021, y_train2021 = train_data2021.compute().copy(), train_data2021.ctx0.compute().copy()



CPU times: user 9.17 s, sys: 1.55 s, total: 10.7 s
Wall time: 49.9 s


In [72]:
%%time 

# conversion a pandas
# datos de validacion de 2021
X_val2021, y_val2021 = val_data2021.compute().copy(), val_data2021.ctx0.compute().copy()



CPU times: user 6.74 s, sys: 600 ms, total: 7.34 s
Wall time: 34 s


2022

In [73]:
%%time 

# conversion a pandas
# datos de training de 2022
X_train2022, y_train2022 = train_data2022.compute().copy(), train_data2022.ctx0.compute().copy()

NameError: name 'train_data2022' is not defined

In [74]:
%%time 

# conversion a pandas
# datos de validacion de 2022
X_val2022, y_val2022 = val_data2022.compute().copy(), val_data2022.ctx0.compute().copy()

CPU times: user 6.49 s, sys: 614 ms, total: 7.1 s
Wall time: 32.6 s


2023

In [75]:
%%time 

# conversion a pandas
# datos de validacion de 2023
X_test, y_test = val_data2023.compute().copy(), val_data2023.ctx0.compute().copy()

CPU times: user 6.73 s, sys: 670 ms, total: 7.4 s
Wall time: 30.7 s


# Shutdown DASK dist client

In [76]:
client.run(trim_memory)

{'tcp://127.0.0.1:33421': 1,
 'tcp://127.0.0.1:37539': 1,
 'tcp://127.0.0.1:39909': 1,
 'tcp://127.0.0.1:41049': 1,
 'tcp://127.0.0.1:43489': 1}

In [77]:
client.shutdown()

# Merge additional data

### Merge with data festius

In [None]:

X_train = pd.merge(X_train, df_calendari_festius, on=['year', 'month', 'day'], how='left')
X_val = pd.merge(X_val, df_calendari_festius, on=['year', 'month', 'day'], how='left')
X_test = pd.merge(X_test, df_calendari_festius, on=['year', 'month', 'day'], how='left')


### Merge with data meteo

In [None]:

X_train = pd.merge(X_train, df_estadistica_meteo, on=['year', 'month', 'day'], how='left')
X_val = pd.merge(X_val, df_estadistica_meteo, on=['year', 'month', 'day'], how='left')
X_test = pd.merge(X_test, df_estadistica_meteo, on=['year', 'month', 'day'], how='left')


# Final Cleaning & preprocessing

In [None]:
X_train.columns

In [None]:
X_test.festius.fillna(0, inplace=True)
X_test.festius_sun.fillna(0, inplace=True)
X_test.festius_sun_sat.fillna(0, inplace=True)
X_test.weekend.fillna(0, inplace=True)

X_test.loc[X_test.dayofweek.isin([5,6]),'weekend'] = 1
X_test.loc[X_test.dayofweek.isin([5,6]),'festius_sun_sat'] = 1
X_test.loc[X_test.dayofweek.isin([6]),'festius_sun'] = 1


In [None]:
%%time

nans = get_features_nans(X_test)

zeros = get_features_zero(X_test)

In [None]:
nans

In [None]:
zeros

In [None]:
%%time

nans = get_features_nans(pd.concat([X_train, X_val, X_test]))

zeros = get_features_zero(pd.concat([X_train, X_val, X_test]))

In [None]:
nans

In [None]:
zeros

## Feature extraction

In [None]:
columns = ['ctx1', 'ctx2', 'ctx3', 'ctx4']
combination = [columns[:i] for i in range(2,len(columns)+1)]

for comb in combination:
    suffix = '_'.join(comb)
    print(suffix)
    X_train[f'{suffix}_mean'] = X_train[comb].mean(axis=1)
    X_train[f'{suffix}_std'] = X_train[comb].std(axis=1)
    
    X_val[f'{suffix}_mean'] = X_val[comb].mean(axis=1)
    X_val[f'{suffix}_std'] = X_val[comb].std(axis=1)
    
    X_test[f'{suffix}_mean'] = X_test[comb].mean(axis=1)
    X_test[f'{suffix}_std'] = X_test[comb].std(axis=1)
    

# Save prepared data to a file 

In [None]:
%%time 

X_train.to_csv('dades/processed/training_data.csv', header=True, index=True, index_label='index')
y_train.to_csv('dades/processed/training_label.csv', header=True, index=True, index_label='index')

In [None]:
%%time 

X_val.to_csv('dades/processed/validation_data.csv', header=True, index=True, index_label='index')
y_val.to_csv('dades/processed/validation_label.csv', header=True, index=True, index_label='index')

In [None]:
%%time 

X_test.to_csv('dades/processed/testing_data.csv', header=True, index=True, index_label='index')
X_test.to_csv('dades/processed/testing_label.csv', header=True, index=True, index_label='index')

In [None]:
years = X_train.year.unique().tolist()
years

In [None]:
months = X_train.month.unique().tolist()
months

# Run pipeline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn import neighbors
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import validation_curve

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

In [None]:
X_train.columns

In [None]:
columns = X_train.columns[['VALOR' not in i for i in X_train.columns]].to_list()

In [None]:
%%time 

correlations = np.abs(
    X_train[columns].corr(method='pearson').ctx0
).sort_values(ascending=False)

correlations.to_dict()

In [None]:
columns_ctx = X_train.columns[['ctx' in i for i in X_train.columns]].to_list()
columns_ctx.remove('ctx0')

In [None]:
columns_meteo = ['VALOR_TN_X4','VALOR_TM_X4', 'VALOR_TX_X4', 'VALOR_PPT_X4']

In [None]:
class Config:
    num_attribs0 = ['capacity'] + columns_ctx + columns_meteo
    cat_attribs0 = ['month', 'dayofyear', 'hour', 'dayofweek', 'day'] 
    cat_attribs1 = ['']
    gen_attribs0 = ['station_id', 'festius_sun', 'weekend']
    target_col = ['ctx0']
    
    seed=42
    
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    

In [None]:
config=Config()
seed_everything(config.seed)

In [None]:

def build_preprocessor(config):
    num_attribs0 = config.num_attribs0
    cat_attribs0 = config.cat_attribs0
    cat_attribs1 = config.cat_attribs1
    gen_attribs0 = config.gen_attribs0

    num_transformer0 = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ('std_scaler', (StandardScaler())),
    ])

    categorical_transformer0 = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
        ('ordinal_encoder', (OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan))),
    ])
    
    categorical_transformer1 = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
        ('one_hot_encoder', (OneHotEncoder(handle_unknown='ignore'))),
    ])
    
    generic_transformer0 = Pipeline([
        ("imputer", SimpleImputer(strategy="constant",fill_value=0)),
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ("num0", num_transformer0, num_attribs0),
            ("gen1", generic_transformer0, gen_attribs0),
            ("cat0", categorical_transformer0, cat_attribs0),
#             ("cat1", categorical_transformer1, cat_attribs1),
        ],
        remainder="drop"
    )
    
    return preprocessor

In [None]:
%%time 

full_pipeline = build_preprocessor(config)

In [None]:
full_pipeline.fit(X_train)

In [None]:
def apply_pipeline(pipline, X, y, args=None, show=True):
    assert X.shape[0] == y.shape[0]
    
    X_prepared = full_pipeline.transform(X)
    
    if show:
        print("X", X.shape, 
              "X_prepared:", X_prepared.shape,
              "y: ", y.shape
             )
        
    if args:
        return X_prepared, y, *args
    else:
        return X_prepared, y

In [None]:
def prepare_splits(
    pipeline,
    data_train,
    data_val,
    data_test,
    years, 
    months, 
    show=True
):
    if show:
        print("train")
    
    train_cond = (data_train[0].year.isin(years) & data_train[0].month.isin(months))
    Xtr, ytr = apply_pipeline(
        pipeline,
        data_train[0][train_cond], 
        data_train[1][train_cond],
        show=show
    )

    if show:
        print("val")
    
    val_cond = (data_val[0].year.isin(years) & data_val[0].month.isin(months))
    Xva, yva = apply_pipeline(
        pipeline, 
        data_val[0][val_cond], 
        data_val[1][val_cond],
        show=show
    )

    if show:
        print("test")
    
    test_cond = (data_test[0].year.isin([2023]) & data_test[0].month.isin([3]))
    Xte, yte = apply_pipeline(
        pipeline, 
        data_test[0][test_cond],
        data_test[1][test_cond],
        show=show
    )
    
    return Xtr, ytr, Xva, yva, Xte, yte

In [None]:
def test_model_train_val_test(
    model,
    data_train,
    data_val,
    data_test,
    skip_cv=False,
    show=False,
    error_score='raise'
):
    if skip_cv==False:
        cross_val_evaluation(
            model, 
            data_train[0], 
            data_train[1], 
            'model test', 
            n_jobs=5,
            error_score=error_score
        )
    
    model.fit(data_train[0], data_train[1])
    
    return {
        'train':test_model(model, data_train[0], data_train[1], show=show)[['mse_t','rmse_t','mae_t','r2_t']],
        'test':test_model(model, data_val[0], data_val[1], data_test[0], data_test[1], show=show)[['mse_t','rmse_t','mae_t','r2_t', 'mse_v','rmse_v','mae_v','r2_v']]
    }


In [None]:
def do_test_per_month(
    model, 
    modelargs,
    data_train,
    data_val,
    data_test,
    years,
    months
):
    result = {}
    with tqdm(range (len(years)*len(months)), unit="months", mininterval=0, disable=False) as bar:
        for index in bar:
            month = months[(index%len(months))]
            year = years[int(index>=len(months))]
            bar.set_description(f"Month {year}-{month}")

            Xtr, ytr, Xva, yva, Xte, yte = prepare_splits(
                full_pipeline,
                data_train,
                data_val,
                data_test,
                [year], [month], False)
            
            # define model 
            m = model(**modelargs)

            result[index] = test_model_train_val_test(
                m,
                (Xtr, ytr),
                (Xva, yva),
                (Xte, yte),
                skip_cv=True,
                show=False,
                error_score='raise'
            )
                
    result=pd.DataFrame(result)
    aux=pd.DataFrame(result.T.unstack().reset_index()[0].to_dict()).T
    aux['type']=pd.DataFrame(result.T.unstack().reset_index().level_0)
    aux['order']=pd.DataFrame(result.T.unstack().reset_index().level_1)
    aux = pd.DataFrame(aux.set_index(['type', 'order']).unstack(['type']).reindex()).reset_index()
    aux.columns = ['_'.join(col) for col in aux.columns.values]
    
    return aux

In [None]:
def do_test_per_month_combos(
    model, 
    modelargs,
    data_train,
    data_val,
    data_test,
    years,
    months
):
    result = {}
    with tqdm(range (len(years)*len(months)), unit="months", mininterval=0, disable=False) as bar:
        for index in bar:
            month_combo = months[(index%len(months))]
            year = years[int(index>=len(months))]
            bar.set_description(f"Month {year}-{month_combo}")

            Xtr, ytr, Xva, yva, Xte, yte = prepare_splits(
                full_pipeline,
                data_train,
                data_val,
                data_test,
                [year], month_combo, False)
            
            # define model 
            m = model(**modelargs)

            result[index] = test_model_train_val_test(
                m,
                (Xtr, ytr),
                (Xva, yva),
                (Xte, yte),
                skip_cv=True,
                show=False,
                error_score='raise'
            )
            
    result=pd.DataFrame(result)
    aux=pd.DataFrame(result.T.unstack().reset_index()[0].to_dict()).T
    aux['type']=pd.DataFrame(result.T.unstack().reset_index().level_0)
    aux['order']=pd.DataFrame(result.T.unstack().reset_index().level_1)
    aux = pd.DataFrame(aux.set_index(['type', 'order']).unstack(['type']).reindex()).reset_index()
    aux.columns = ['_'.join(col) for col in aux.columns.values]
    
    return aux

In [None]:
[months.remove(x) for x in [6,7,8,9,10,11]]

In [None]:
years, months

In [None]:
X_train.shape,y_train.shape

In [None]:
X_val.shape, y_val.shape

In [None]:
# Take data of 2022 

X_train_prepared, y_train_prepared, X_val_prepared, y_val_prepared, X_test_prepared, y_test_prepared = prepare_splits(
    full_pipeline,
    (X_train, y_train),
    (X_val, y_val),
    (X_test, y_test),
    [2022], months, True)


In [None]:
import gc

gc.collect()

# Predicción______________________________

a) Regresión lineal: relación lineal entre las variables de entrada y la variable de salida. 

b) Regresión Redes Neuronales (RNN -redes neuronales recurrentes-): pueden capturar relaciones no lineales entre las variables de entrada y salida.

## LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [None]:
lin_reg = LinearRegression(n_jobs=5)

In [None]:
ridge_reg = Ridge(alpha=.3, random_state=config.seed)

In [None]:
lasso_reg = Lasso(alpha=0.01, random_state=config.seed)

In [None]:
elasticNet_reg = ElasticNet(alpha=0.01, l1_ratio=0.3, random_state=config.seed) 

## Train and test, validate model

In [None]:
%%time 

test_model_train_val_test(
    lin_reg, 
    (X_train_prepared, y_train_prepared),
    (X_val_prepared, y_val_prepared),
    (X_test_prepared,  y_test_prepared),
    show=True
)


In [None]:
%%time 

test_model_train_val_test(
    lasso_reg, 
    (X_train_prepared, y_train_prepared),
    (X_val_prepared, y_val_prepared),
    (X_test_prepared,  y_test_prepared),
    show=True
)


In [None]:
%%time 

test_model_train_val_test(
    ridge_reg, 
    (X_train_prepared, y_train_prepared),
    (X_val_prepared, y_val_prepared),
    (X_test_prepared,  y_test_prepared),
    show=True
)


In [None]:
%%time 

test_model_train_val_test(
    elasticNet_reg, 
    (X_train_prepared, y_train_prepared),
    (X_val_prepared, y_val_prepared),
    (X_test_prepared,  y_test_prepared),
    show=True
)


## RANSACRegressor

In [None]:
from sklearn.linear_model import RANSACRegressor

In [None]:
%%time 

# Set RANSAC hyperparameters
ransac = RANSACRegressor(
    LinearRegression(n_jobs=5),
    max_trials=5, # Number of Iterations
    min_samples=2, # Minimum size of the sample
    loss='absolute_error',# Metrics for loss
    residual_threshold=0.5 # Threshold
)


In [None]:
%%time 

test_model_train_val_test(
    ransac, 
    (X_train_prepared, y_train_prepared),
    (X_val_prepared, y_val_prepared),
    (X_test_prepared,  y_test_prepared),
    show=True
)

## Generate Sample

In [None]:
sample_data = pd.read_csv('dades/metadata_sample_submission.csv/metadata_sample_submission.csv')

In [None]:
sample_data['year'] = 2023

In [None]:
sample_data['datetime'] = pd.to_datetime(sample_data[['year','month','day']])
sample_data['dayofweek'] = sample_data.datetime.dt.dayofweek
sample_data['dayofyear'] = sample_data.datetime.dt.dayofyear

In [None]:
sample_data.drop(['datetime', 'index'], axis=1, inplace=True)

In [None]:
sample_data.rename(
    columns = {
        'ctx-4':'ctx4',
        'ctx-3':'ctx3',
        'ctx-2':'ctx2',
        'ctx-1':'ctx1'
    },
    inplace=True
)

In [None]:
sample_data

##  Feature extraction

In [None]:
columns = ['ctx1', 'ctx2', 'ctx3', 'ctx4']
combination = [columns[:i] for i in range(2,len(columns)+1)]

for comb in combination:
    suffix = '_'.join(comb)
    print(suffix)
    sample_data[f'{suffix}_mean'] = sample_data[comb].mean(axis=1)
    sample_data[f'{suffix}_std'] = sample_data[comb].std(axis=1)
    

# Merge additional data

### Merge with data festius

In [None]:

sample_data = pd.merge(sample_data, df_calendari_festius, on=['year', 'month', 'day'], how='left')


### Merge with data meteo

In [None]:

sample_data = pd.merge(sample_data, df_estadistica_meteo, on=['year', 'month', 'day'], how='left')


# merge capacity 

In [None]:
X_train.capacity = X_train.capacity.astype(int)

In [None]:
station_capacity = X_train.groupby(['station_id']).capacity.max()


In [None]:
station_capacity = station_capacity.reset_index()

In [None]:

sample_data = pd.merge(sample_data, station_capacity, on=['station_id'], how='left')


# check columns that does not intersect and save sample data modified to file 

In [None]:
[x for x in X_train.columns if x not in sample_data.columns]

In [None]:
%%time 

sample_data.to_csv('dades/processed/kaggle_sample_data.csv', header=True, index=True, index_label='index')


# apply pipeline

In [None]:
X_sample_prepared = full_pipeline.transform(sample_data)

print("x_train_prepared:",X_train_prepared.shape,"y_train: ",y_train_prepared.shape)
print("x_test_prepared:",X_val_prepared.shape,"y_test: ",y_val_prepared.shape)
print("x_test_prepared:",X_test_prepared.shape,"y_test: ",y_test_prepared.shape)
print("x_test_prepared:",X_sample_prepared.shape)

In [None]:
%%time 

model = forest

In [None]:
name = f'{model.__class__}'.split('.')[-1]
name = name.translate( { ord(i): None for i in "'<>"})
number = 1

In [None]:
test_model_train_val_test(
    model, 
    (X_train_prepared, y_train_prepared),
    (X_val_prepared, y_val_prepared),
    (X_test_prepared,  y_test_prepared),
    show=True
)

In [None]:
yhat = lin_reg.predict(X_test_prepared)

In [None]:
yhat.shape

In [None]:
sample_data['percentage_docks_available'] = yhat

In [None]:
sample_data['percentage_docks_available'].to_csv(f'{name}{number}.csv', header=True, index_label='index')