In [1]:
# Set the notebook's `.container` CSS width to 80% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module:
# importing `display` from `IPython.core.display` is deprecated
# (since IPython 7.14).
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# The (major, minor) numbers are compared numerically: comparing the raw
# version strings is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
import re
import sklearn
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, f"unparseable scikit-learn version: {sklearn.__version__}"
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re

from tqdm.notebook import tqdm

# Put the relative '../../tools/' directory first on the import path so the
# `tools` module imported below is resolved from there.
sys.path.insert(0, '../../tools/')

# NOTE(review): the star import hides where names come from. The helpers called
# in this notebook without a visible definition (convert_timestamp,
# timestamp_multipleof, correct_columns, remove_duplicates_all,
# print_duplicates, get_features_nans, get_features_zero, get_columns_unique)
# presumably come from this module — consider importing them by name.
from tools import * 

# Run parameters read by the cells below:
#   devide_by          -> forwarded to timestamp_multipleof(devide_by=...)
#   year/month/monthname, datafrom, dataset, path
#                      -> used to build the input and output file paths
#   ttl                -> value written to the temporary 'ttl' column
config = pd.Series(
    dict(
        devide_by=60,
        year=2019,
        datafrom='BICING_ESTACIONS',
        dataset='BicingNou_MOD',
        ttl=30,
        month=1,
        monthname='Gener',
        path='../../dades',
    )
)

# Make sure the output directory for the processed dataset exists.
# `os.makedirs(..., exist_ok=True)` does what `mkdir -p` did, but without
# spawning a shell (so it also works where `mkdir -p` is unavailable) and a
# failure raises an exception instead of returning a non-zero exit code that
# nobody checks.
os.makedirs(f"{config.path}/{config.year}/{config.dataset}", exist_ok=True)


0

# Data: Gener (January) 2019

In [2]:
%%time

# Load the raw station file for the configured year/month.
dades_2019_Gener_info_old = pd.read_csv(f'{config.path}/{config.year}/{config.datafrom}/{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom}.csv', low_memory=False)

# Keep the starting row count so the number of removed rows can be reported.
intial_size = dades_2019_Gener_info_old.shape[0]
print(dades_2019_Gener_info_old.shape)

# Rename the raw columns to the names used in the rest of the notebook.
dades_2019_Gener_info_old = dades_2019_Gener_info_old.rename(
    columns={
        'id': 'station_id',
        'latitude': 'lat',
        'longitude': 'lon',
        'type': 'physical_configuration',
        'updateTime': 'last_updated',
        'bikes': 'num_bikes_available',
        'slots': 'num_docks_available',
        'streetName': 'street_name',
        'streetNumber': 'street_number',
    }
)

# Encode the status codes found in this file as a flag: 'CLS' -> 0, 'OPN' -> 1
# (presumably closed / open).
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the selected column: that chained in-place
# form updates a temporary object under pandas Copy-on-Write and would leave
# the DataFrame unchanged.
dades_2019_Gener_info_old['status'] = dades_2019_Gener_info_old['status'].replace({'CLS': 0, 'OPN': 1})

# Same encoding for the station type: 'BIKE' -> 0, 'BIKE-ELECTRIC' -> 1.
dades_2019_Gener_info_old['physical_configuration'] = dades_2019_Gener_info_old['physical_configuration'].replace({'BIKE': 0, 'BIKE-ELECTRIC': 1})

# Columns filled with constant placeholder values.
# NOTE(review): presumably these fields do not exist in this raw file and are
# added to match the newer dataset's schema — confirm.
dades_2019_Gener_info_old['is_installed'] = 1
dades_2019_Gener_info_old['is_renting'] = 1
dades_2019_Gener_info_old['is_returning'] = 1
dades_2019_Gener_info_old['is_charging_station'] = 1
# Every available bike is counted as mechanical; the ebike count is set to 0.
dades_2019_Gener_info_old['num_bikes_available_types.mechanical'] = dades_2019_Gener_info_old['num_bikes_available']
dades_2019_Gener_info_old['num_bikes_available_types.ebike'] = 0
dades_2019_Gener_info_old['post_code'] = '0'
# capacity starts at 0 here; a later cell raises it from bikes + docks.
dades_2019_Gener_info_old['capacity'] = 0

# we don't have this column anywhere in the new dataset so it got removed
dades_2019_Gener_info_old = dades_2019_Gener_info_old.drop(columns=['nearbyStations'])

# Parse `last_updated` using the day/month/2-digit-year pattern below.
# NOTE(review): the helper presumably also adds the *_last_updated_date
# component columns referenced in the next two steps — confirm in tools.
dades_2019_Gener_info_old = convert_timestamp(dades_2019_Gener_info_old.copy(), ['last_updated'], sort=True, add=True, pattern='%d/%m/%y %H:%M:%S')

# Rebuild `last_updated` from its date/time component columns, with the
# minutes handled in steps of `config.devide_by` (60 in this configuration).
# NOTE(review): exact rounding behaviour lives in tools.timestamp_multipleof.
dades_2019_Gener_info_old = timestamp_multipleof(
    devide_by=config.devide_by,
    column='minutes_last_updated_date',
    df=dades_2019_Gener_info_old.copy(),
    new_column='last_updated',
    year_column='year_last_updated_date',
    month_column='month_last_updated_date',
    day_column='dayofmonth_last_updated_date',
    hour_column='hour_last_updated_date',
    minutes_column='minutes_last_updated_date'
)

# drop not needed columns
dades_2019_Gener_info_old = dades_2019_Gener_info_old.drop(
    columns=['week_last_updated_date', 'minutes_last_updated_date']
)

print(dades_2019_Gener_info_old.shape)
print('removed:', intial_size-dades_2019_Gener_info_old.shape[0])

(3979843, 12)
(3979843, 25)
removed: 0
CPU times: user 1min 21s, sys: 3.73 s, total: 1min 24s
Wall time: 1min 25s


In [3]:
%%time

# Normalise street_number to strings and use the string '0' for missing values.
# The result is assigned back to the column: calling
# `fillna(..., inplace=True)` on an attribute/column selection is a chained
# in-place update, which is a no-op on the parent DataFrame under pandas
# Copy-on-Write.
dades_2019_Gener_info_old['street_number'] = dades_2019_Gener_info_old['street_number'].astype(str).fillna('0')
# astype(str) can turn missing values into the literal text 'nan'; '.' is
# treated as missing as well.
dades_2019_Gener_info_old.loc[dades_2019_Gener_info_old['street_number'].isin(['nan', '.']), ['street_number']] = '0'

# Reference value per station: the maximum (string comparison) of the
# street_number values that are not the '0' placeholder.
correct_column = (
    dades_2019_Gener_info_old[dades_2019_Gener_info_old['street_number'] != '0']
    .groupby(['station_id'])['street_number']
    .max()
)
correct_column = pd.DataFrame(correct_column.reset_index())
# correct the data according to the data in the correct column
dades_2019_Gener_info_old = correct_columns(dades_2019_Gener_info_old.copy(), 'station_id', 'street_number', correct_column=correct_column)

# Fill any street_number left missing after the correction step.
dades_2019_Gener_info_old['street_number'] = dades_2019_Gener_info_old['street_number'].fillna('0')

# Wherever bikes + free docks exceed the recorded capacity, raise the capacity
# to that observed total. The sum is computed once and aligned on the index,
# instead of re-filtering the frame with chained `df[cond][col]` reads.
observed_total = dades_2019_Gener_info_old['num_bikes_available'] + dades_2019_Gener_info_old['num_docks_available']
cond = observed_total > dades_2019_Gener_info_old['capacity']
dades_2019_Gener_info_old.loc[cond, ['capacity']] = observed_total[cond]

# Make capacity consistent per station (the helper's default strategy applies).
dades_2019_Gener_info_old = correct_columns(dades_2019_Gener_info_old.copy(), 'station_id', 'capacity')

# Any capacity still missing after the correction is set to 0.
dades_2019_Gener_info_old.loc[dades_2019_Gener_info_old['capacity'].isna(), ['capacity']] = 0

# Make altitude consistent per station, using the helper's 'first' strategy.
dades_2019_Gener_info_old = correct_columns(dades_2019_Gener_info_old.copy(), 'station_id', 'altitude', take='first')

CPU times: user 8.45 s, sys: 1.95 s, total: 10.4 s
Wall time: 10.4 s


In [4]:
# Check the (station_id, last_updated) key for duplicates.
# NOTE(review): the helper appears to report, per column, how many distinct
# values share one key (1 = consistent) — confirm in tools.print_duplicates.
print_duplicates(dades_2019_Gener_info_old, ['station_id', 'last_updated'])

physical_configuration                   1
lat                                      1
lon                                      1
street_name                              1
num_docks_available                     12
num_bikes_available                     12
status                                   2
is_installed                             1
is_renting                               1
is_returning                             1
is_charging_station                      1
num_bikes_available_types.mechanical    12
num_bikes_available_types.ebike          1
post_code                                1
year_last_updated_date                   1
month_last_updated_date                  1
dayofweek_last_updated_date              1
dayofmonth_last_updated_date             1
dayofyear_last_updated_date              1
hour_last_updated_date                   1
street_number                            1
capacity                                 1
altitude                                 1
dtype: int64

In [5]:
# Report the columns that contain NaN values.
# NOTE(review): presumably an empty dict means no column has NaNs — confirm.
get_features_nans(dades_2019_Gener_info_old)

{}

In [6]:
# Report the columns that contain zeros.
# NOTE(review): the values look like the percentage of zero rows per column —
# confirm in tools.get_features_zero.
get_features_zero(dades_2019_Gener_info_old)

{'physical_configuration': 90.32192978466739,
 'num_docks_available': 3.3194022980303495,
 'num_bikes_available': 23.09616233605195,
 'status': 14.145181103877716,
 'num_bikes_available_types.mechanical': 23.09616233605195,
 'num_bikes_available_types.ebike': 100.0,
 'dayofweek_last_updated_date': 12.922620314419438,
 'hour_last_updated_date': 4.144183577090855,
 'altitude': 3.6940904452763585}

In [7]:
# Show the distinct values of each column (returned as a dict keyed by column
# name, judging by the displayed result).
get_columns_unique(dades_2019_Gener_info_old)

{'station_id': array([  1, 320, 319, 318, 317, 316, 315, 314, 313, 312, 311, 310, 321,
        309, 307, 306, 305, 304, 303, 302, 301, 300, 299, 298, 297, 308,
        322, 323, 324, 349, 348, 347, 346, 345, 344, 343, 342, 341, 340,
        339, 338, 337, 336, 335, 334, 333, 332, 331, 330, 329, 328, 327,
        326, 325, 296, 350, 295, 292, 261, 260, 259, 258, 256, 255, 254,
        253, 252, 251, 250, 262, 249, 247, 246, 244, 243, 242, 241, 240,
        239, 238, 237, 236, 248, 263, 264, 265, 291, 289, 288, 287, 286,
        285, 284, 283, 282, 281, 280, 279, 278, 277, 276, 275, 274, 273,
        272, 271, 270, 269, 268, 267, 266, 294, 352, 353, 354, 466, 465,
        464, 463, 462, 461, 460, 459, 458, 457, 456, 467, 455, 453, 452,
        451, 428, 427, 426, 425, 424, 423, 421, 420, 454, 468, 469, 470,
        496, 495, 494, 493, 492, 491, 489, 488, 487, 486, 485, 484, 483,
        482, 481, 480, 479, 478, 477, 476, 475, 474, 473, 472, 471, 419,
        418, 416, 415, 381, 380, 379,

In [8]:
# Display the frame as the cell result (pandas renders a truncated preview).
dades_2019_Gener_info_old

Unnamed: 0,station_id,physical_configuration,lat,lon,street_name,num_docks_available,num_bikes_available,status,last_updated,is_installed,is_renting,is_returning,is_charging_station,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,post_code,year_last_updated_date,month_last_updated_date,dayofweek_last_updated_date,dayofmonth_last_updated_date,dayofyear_last_updated_date,hour_last_updated_date,street_number,capacity,altitude
0,1,0,41.397952,2.180042,Gran Via Corts Catalanes,10,16,1,1546300800,1,1,1,1,16,0,0,2019,1,1,1,1,0,760,30,21
1,320,0,41.401118,2.147989,Vía Augusta,26,0,1,1546300800,1,1,1,1,0,0,0,2019,1,1,1,1,0,21,27,80
2,319,0,41.393713,2.145584,Saguès,30,0,1,1546300800,1,1,1,1,0,0,0,2019,1,1,1,1,0,1,33,65
3,318,0,41.413406,2.171331,Cartagena,24,0,1,1546300800,1,1,1,1,0,0,0,2019,1,1,1,1,0,83,26,59
4,317,0,41.425566,2.200693,Rambla Prim,7,19,1,1546300800,1,1,1,1,19,0,0,2019,1,1,1,1,0,256,27,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3979838,151,0,41.400650,2.197190,Pallars,21,3,1,1548975600,1,1,1,1,3,0,0,2019,1,3,31,31,23,462,24,6
3979839,150,0,41.406549,2.203112,Espronceda,20,6,1,1548975600,1,1,1,1,6,0,0,2019,1,3,31,31,23,124,27,10
3979840,149,0,41.395905,2.192958,Pujades,21,4,1,1548975600,1,1,1,1,4,0,0,2019,1,3,31,31,23,57B,27,11
3979841,161,0,41.395009,2.196308,Ramon Turro,27,0,1,1548975600,1,1,1,1,0,0,0,2019,1,3,31,31,23,91,27,11


In [9]:
%%time

# Remember the row count so the number of collapsed rows can be reported.
intial_size = len(dades_2019_Gener_info_old)
print(dades_2019_Gener_info_old.shape)

# Add the configured ttl as a column, then let the helper remove the duplicated
# `last_updated` entries for all stations in the dataset.
# NOTE(review): exact merge rules live in tools.remove_duplicates_all; the
# recorded wall time for this step is over two hours.
dades_2019_Gener_info_old = remove_duplicates_all(
    dades_2019_Gener_info_old.assign(ttl=config.ttl),
    'last_updated',
)

print(dades_2019_Gener_info_old.shape)
print(f'removed: {intial_size - len(dades_2019_Gener_info_old)}')

(3979843, 25)


  0%|          | 0/463 [00:00<?, ?it/s]

(342919, 26)
removed: 3636924
CPU times: user 2h 17min 4s, sys: 5.88 s, total: 2h 17min 10s
Wall time: 2h 17min 9s


In [10]:
%%time

# Renumber the rows from 0 and remove the temporary `ttl` column that was
# added for the de-duplication step.
dades_2019_Gener_info_old = (
    dades_2019_Gener_info_old
    .reset_index(drop=True)
    .drop(columns=['ttl'])
)

# Save a checkpoint of the processed month (no index column in the CSV).
dades_2019_Gener_info_old.to_csv(
    f'{config.path}/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv',
    index=False,
)

CPU times: user 6.72 s, sys: 137 ms, total: 6.85 s
Wall time: 6.88 s


# Visualize data

In [11]:
# Load the processed month back from the checkpoint CSV so the following
# checks run on what is stored on disk.
dades_2019_Gener_info_old = pd.read_csv(
    f'{config.path}/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv',
    low_memory=False,
)

In [12]:
# Display the reloaded frame as the cell result (truncated preview).
dades_2019_Gener_info_old

Unnamed: 0,station_id,physical_configuration,lat,lon,street_name,num_docks_available,num_bikes_available,status,last_updated,is_installed,is_renting,is_returning,is_charging_station,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,post_code,year_last_updated_date,month_last_updated_date,dayofweek_last_updated_date,dayofmonth_last_updated_date,dayofyear_last_updated_date,hour_last_updated_date,street_number,capacity,altitude
0,1.0,0.0,41.397952,2.180042,Gran Via Corts Catalanes,8.363636,17.636364,1.0,1.546301e+09,1.0,1.0,1.0,1.0,17.636364,0.0,0,2019.0,1.0,1.0,1.0,1.0,0.0,760,30.0,21.0
1,1.0,0.0,41.397952,2.180042,Gran Via Corts Catalanes,7.181818,17.818182,1.0,1.546304e+09,1.0,1.0,1.0,1.0,17.818182,0.0,0,2019.0,1.0,1.0,1.0,1.0,1.0,760,30.0,21.0
2,1.0,0.0,41.397952,2.180042,Gran Via Corts Catalanes,7.545455,16.818182,1.0,1.546308e+09,1.0,1.0,1.0,1.0,16.818182,0.0,0,2019.0,1.0,1.0,1.0,1.0,2.0,760,30.0,21.0
3,1.0,0.0,41.397952,2.180042,Gran Via Corts Catalanes,8.818182,15.636364,1.0,1.546312e+09,1.0,1.0,1.0,1.0,15.636364,0.0,0,2019.0,1.0,1.0,1.0,1.0,3.0,760,30.0,21.0
4,1.0,0.0,41.397952,2.180042,Gran Via Corts Catalanes,4.818182,21.181818,1.0,1.546315e+09,1.0,1.0,1.0,1.0,21.181818,0.0,0,2019.0,1.0,1.0,1.0,1.0,4.0,760,30.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342914,194.0,0.0,41.381013,2.132319,Joan Güell,25.000000,0.000000,0.0,1.548961e+09,1.0,1.0,1.0,1.0,0.000000,0.0,0,2019.0,1.0,3.0,31.0,31.0,19.0,98,25.0,40.0
342915,194.0,0.0,41.381013,2.132319,Joan Güell,25.000000,0.000000,0.0,1.548965e+09,1.0,1.0,1.0,1.0,0.000000,0.0,0,2019.0,1.0,3.0,31.0,31.0,20.0,98,25.0,40.0
342916,194.0,0.0,41.381013,2.132319,Joan Güell,25.000000,0.000000,0.0,1.548968e+09,1.0,1.0,1.0,1.0,0.000000,0.0,0,2019.0,1.0,3.0,31.0,31.0,21.0,98,25.0,40.0
342917,194.0,0.0,41.381013,2.132319,Joan Güell,25.000000,0.000000,0.0,1.548972e+09,1.0,1.0,1.0,1.0,0.000000,0.0,0,2019.0,1.0,3.0,31.0,31.0,22.0,98,25.0,40.0


In [13]:
# Re-check the (station_id, last_updated) key on the reloaded data.
# NOTE(review): a value of 1 for every column presumably means each key now
# maps to a single row/value — confirm in tools.print_duplicates.
print_duplicates(dades_2019_Gener_info_old, ['station_id', 'last_updated'])

physical_configuration                  1
lat                                     1
lon                                     1
street_name                             1
num_docks_available                     1
num_bikes_available                     1
status                                  1
is_installed                            1
is_renting                              1
is_returning                            1
is_charging_station                     1
num_bikes_available_types.mechanical    1
num_bikes_available_types.ebike         1
post_code                               1
year_last_updated_date                  1
month_last_updated_date                 1
dayofweek_last_updated_date             1
dayofmonth_last_updated_date            1
dayofyear_last_updated_date             1
hour_last_updated_date                  1
street_number                           1
capacity                                1
altitude                                1
dtype: int64

In [14]:
# Report the columns of the reloaded data that contain NaN values.
# NOTE(review): presumably an empty dict means no column has NaNs — confirm.
get_features_nans(dades_2019_Gener_info_old)

{}

In [15]:
# Report the columns of the reloaded data that contain zeros.
# NOTE(review): the values look like the percentage of zero rows per column —
# confirm in tools.get_features_zero.
get_features_zero(dades_2019_Gener_info_old)

{'physical_configuration': 90.24609310070308,
 'num_docks_available': 1.3560053540340429,
 'num_bikes_available': 17.612905671601748,
 'status': 13.816673908415689,
 'num_bikes_available_types.mechanical': 17.612905671601748,
 'num_bikes_available_types.ebike': 100.0,
 'post_code': 100.0,
 'dayofweek_last_updated_date': 12.885550231978982,
 'hour_last_updated_date': 4.16687322662203,
 'altitude': 3.688334562972597}

In [16]:
# Show the distinct values of each column in the reloaded data (returned as a
# dict keyed by column name, judging by the displayed result).
get_columns_unique(dades_2019_Gener_info_old)

{'station_id': array([  1., 320., 319., 318., 317., 316., 315., 314., 313., 312., 311.,
        310., 321., 309., 307., 306., 305., 304., 303., 302., 301., 300.,
        299., 298., 297., 308., 322., 323., 324., 349., 348., 347., 346.,
        345., 344., 343., 342., 341., 340., 339., 338., 337., 336., 335.,
        334., 333., 332., 331., 330., 329., 328., 327., 326., 325., 296.,
        350., 295., 292., 261., 260., 259., 258., 256., 255., 254., 253.,
        252., 251., 250., 262., 249., 247., 246., 244., 243., 242., 241.,
        240., 239., 238., 237., 236., 248., 263., 264., 265., 291., 289.,
        288., 287., 286., 285., 284., 283., 282., 281., 280., 279., 278.,
        277., 276., 275., 274., 273., 272., 271., 270., 269., 268., 267.,
        266., 294., 352., 353., 354., 466., 465., 464., 463., 462., 461.,
        460., 459., 458., 457., 456., 467., 455., 453., 452., 451., 428.,
        427., 426., 425., 424., 423., 421., 420., 454., 468., 469., 470.,
        496., 495., 494.