In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re

from tqdm.notebook import tqdm

sys.path.insert(0, '../../tools/')

from tools import * 

config = pd.Series({
    'devide_by':60,
    'year':2019,
    'datafrom': 'BICING_ESTACIONS',
    'dataset': 'BicingNou_MOD',
    'ttl': 30,
    'month': 2,
    'monthname': 'Febrer',
    'path':'../../dades'
})

os.system(f"mkdir -p {config.path}/{config.year}/{config.dataset}")


0

# Data Febrer 2019 

In [2]:
%%time

dades_2019_Febrer_info_old = pd.read_csv(f'{config.path}/{config.year}/{config.datafrom}/{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom}.csv', low_memory=False)

intial_size = dades_2019_Febrer_info_old.shape[0]
print(dades_2019_Febrer_info_old.shape)

dades_2019_Febrer_info_old.rename(
    columns={
        'id': 'station_id',
        'latitude':'lat',
        'longitude':'lon',
        'type':'physical_configuration',
        'updateTime':'last_updated',
        'bikes':'num_bikes_available',
        'slots':'num_docks_available',
        'streetName':'street_name',
        'streetNumber':'street_number',
    }, 
    inplace=True
)

# STATUS = IN_SERVICE=En servei, CLOSED=Tancada
# replace IN_SERVICE with 1 and CLOSED with 0
dades_2019_Febrer_info_old['status'].replace(to_replace=['CLS', 'OPN'], value=[0, 1], inplace=True)

dades_2019_Febrer_info_old['physical_configuration'].replace(to_replace=['BIKE', 'BIKE-ELECTRIC'], value=[0, 1], inplace=True)

dades_2019_Febrer_info_old['is_installed'] = 1
dades_2019_Febrer_info_old['is_renting'] = 1
dades_2019_Febrer_info_old['is_returning'] = 1
dades_2019_Febrer_info_old['is_charging_station'] = 1
dades_2019_Febrer_info_old['num_bikes_available_types.mechanical'] = dades_2019_Febrer_info_old['num_bikes_available']
dades_2019_Febrer_info_old['num_bikes_available_types.ebike'] = 0
dades_2019_Febrer_info_old['post_code'] = '0'
dades_2019_Febrer_info_old['capacity'] = 0

# we don't have this column anywhere in the new dataset so it got removed
dades_2019_Febrer_info_old.drop('nearbyStations', axis=1, inplace=True)

dades_2019_Febrer_info_old = convert_timestamp(dades_2019_Febrer_info_old.copy(), ['last_updated'], sort=True, add=True, pattern='%d/%m/%y %H:%M:%S')

# convert timestamps to multimple of 5
dades_2019_Febrer_info_old = timestamp_multipleof(
    devide_by=config.devide_by, 
    column='minutes_last_updated_date',
    df=dades_2019_Febrer_info_old.copy(), 
    new_column='last_updated', 
    year_column='year_last_updated_date',
    month_column='month_last_updated_date',
    day_column='dayofmonth_last_updated_date',
    hour_column='hour_last_updated_date',
    minutes_column='minutes_last_updated_date'
)    

# drop not needed columns
dades_2019_Febrer_info_old.drop(
    [
        'week_last_updated_date', 'minutes_last_updated_date'
    ], 
    axis=1, 
    inplace=True
)

print(dades_2019_Febrer_info_old.shape)
print('removed:', intial_size-dades_2019_Febrer_info_old.shape[0])

(3729928, 12)
(3729928, 25)
removed: 0
CPU times: user 1min 38s, sys: 3.39 s, total: 1min 41s
Wall time: 1min 42s


In [3]:
%%time

# replace nans with string of 0
dades_2019_Febrer_info_old.street_number = dades_2019_Febrer_info_old.street_number.astype(str)
dades_2019_Febrer_info_old.street_number.fillna('0', inplace=True)
dades_2019_Febrer_info_old.loc[dades_2019_Febrer_info_old.street_number.isin(['nan', '.']),['street_number']] = '0' 

# extract the correct column
correct_column = dades_2019_Febrer_info_old[dades_2019_Febrer_info_old.street_number != '0'].groupby(['station_id'])['street_number'].max()
correct_column = pd.DataFrame(correct_column.reset_index())
# correct the data according to the data in the correct column
dades_2019_Febrer_info_old = correct_columns(dades_2019_Febrer_info_old.copy(), 'station_id', 'street_number', correct_column=correct_column)

dades_2019_Febrer_info_old.street_number.fillna('0', inplace=True)

cond = (dades_2019_Febrer_info_old.num_bikes_available + dades_2019_Febrer_info_old.num_docks_available) > dades_2019_Febrer_info_old.capacity
dades_2019_Febrer_info_old.loc[cond, ['capacity']] = dades_2019_Febrer_info_old[cond]['num_bikes_available'] + dades_2019_Febrer_info_old[cond]['num_docks_available']

dades_2019_Febrer_info_old = correct_columns(dades_2019_Febrer_info_old.copy(), 'station_id', 'capacity')

dades_2019_Febrer_info_old.loc[dades_2019_Febrer_info_old.capacity.isna(),['capacity']] = 0

dades_2019_Febrer_info_old = correct_columns(dades_2019_Febrer_info_old.copy(), 'station_id', 'altitude', take='first')

CPU times: user 7.54 s, sys: 3.62 s, total: 11.2 s
Wall time: 11.2 s


In [4]:
print_duplicates(dades_2019_Febrer_info_old.copy(), ['station_id', 'last_updated'])

physical_configuration                   1
lat                                      1
lon                                      1
street_name                              1
num_docks_available                     12
num_bikes_available                     12
status                                   2
is_installed                             1
is_renting                               1
is_returning                             1
is_charging_station                      1
num_bikes_available_types.mechanical    12
num_bikes_available_types.ebike          1
post_code                                1
year_last_updated_date                   1
month_last_updated_date                  1
dayofweek_last_updated_date              1
dayofmonth_last_updated_date             1
dayofyear_last_updated_date              1
hour_last_updated_date                   1
street_number                            1
capacity                                 1
altitude                                 1
dtype: int6

In [5]:
get_features_nans(dades_2019_Febrer_info_old)

{}

In [6]:
get_features_zero(dades_2019_Febrer_info_old)

{'physical_configuration': 90.28077753779698,
 'num_docks_available': 1.1798351067366448,
 'num_bikes_available': 52.254011337484265,
 'status': 49.06311864464944,
 'num_bikes_available_types.mechanical': 52.254011337484265,
 'num_bikes_available_types.ebike': 100.0,
 'dayofweek_last_updated_date': 14.299900695134063,
 'hour_last_updated_date': 4.1459781529294935,
 'altitude': 3.6717062634989204}

In [7]:
get_columns_unique(dades_2019_Febrer_info_old)

{'station_id': array([496, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 149,
        161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 162,
        148, 147, 146, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 174, 175, 176, 177, 207, 208, 209, 210, 211, 212, 213,
        214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226,
        227, 228, 229, 230, 231, 206, 120, 205, 203, 178, 179, 180, 181,
        182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
        195, 196, 197, 198, 199, 200, 201, 202, 204, 119, 118, 117,  32,
         33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
         46,  47,  48,  49,  50,  51,  53,  54,  55,  56,  57,  31,  58,
         30,  28,   2,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  29, 232,  59,  61,  92,

In [8]:
%%time

intial_size = dades_2019_Febrer_info_old.shape[0]
print(dades_2019_Febrer_info_old.shape)

dades_2019_Febrer_info_old['ttl'] = config.ttl

### will remove the duplicate for last reported for all stations in the dataset
dades_2019_Febrer_info_old = remove_duplicates_all(dades_2019_Febrer_info_old.copy(), 'last_updated')
# (3729928, 19)

print(dades_2019_Febrer_info_old.shape)
print('removed:', intial_size-dades_2019_Febrer_info_old.shape[0])

(3729928, 25)


  0%|          | 0/463 [00:00<?, ?it/s]

(311599, 26)
removed: 3418329
CPU times: user 2h 7min 47s, sys: 4.34 s, total: 2h 7min 51s
Wall time: 2h 7min 53s


In [9]:
%%time

dades_2019_Febrer_info_old.reset_index(drop=True, inplace=True)

dades_2019_Febrer_info_old.drop(['ttl'], axis=1, inplace=True)

# save checkpoint

dades_2019_Febrer_info_old.to_csv(f'{config.path}/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv', index=False)

CPU times: user 4.77 s, sys: 278 ms, total: 5.05 s
Wall time: 5.06 s


# Visualize data

In [11]:
# re read file
dades_2019_Febrer_info_old = pd.read_csv(f'{config.path}/{config.year}/{config.dataset}/{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv', low_memory=False)

In [12]:
dades_2019_Febrer_info_old

Unnamed: 0,station_id,physical_configuration,lat,lon,street_name,num_docks_available,num_bikes_available,status,last_updated,is_installed,is_renting,is_returning,is_charging_station,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,post_code,year_last_updated_date,month_last_updated_date,dayofweek_last_updated_date,dayofmonth_last_updated_date,dayofyear_last_updated_date,hour_last_updated_date,street_number,capacity,altitude
0,496.0,1.0,41.404871,2.175141,C/ DE PROVENÇA,13.000000,11.000000,1.0,1.548976e+09,1.0,1.0,1.0,1.0,11.000000,0.0,0,2019.0,1.0,3.0,31.0,31.0,23.0,445,25.0,21.0
1,496.0,1.0,41.404871,2.175141,C/ DE PROVENÇA,13.000000,11.000000,1.0,1.548979e+09,1.0,1.0,1.0,1.0,11.000000,0.0,0,2019.0,2.0,4.0,1.0,32.0,0.0,445,25.0,21.0
2,496.0,1.0,41.404871,2.175141,C/ DE PROVENÇA,10.583333,13.416667,1.0,1.548983e+09,1.0,1.0,1.0,1.0,13.416667,0.0,0,2019.0,2.0,4.0,1.0,32.0,1.0,445,25.0,21.0
3,496.0,1.0,41.404871,2.175141,C/ DE PROVENÇA,10.000000,14.000000,1.0,1.548986e+09,1.0,1.0,1.0,1.0,14.000000,0.0,0,2019.0,2.0,4.0,1.0,32.0,2.0,445,25.0,21.0
4,496.0,1.0,41.404871,2.175141,C/ DE PROVENÇA,10.000000,14.000000,1.0,1.548990e+09,1.0,1.0,1.0,1.0,14.000000,0.0,0,2019.0,2.0,4.0,1.0,32.0,3.0,445,25.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311594,495.0,1.0,41.377191,2.149283,C/ DIPUTACIÓ - TARRAGONA,12.833333,11.166667,1.0,1.551380e+09,1.0,1.0,1.0,1.0,11.166667,0.0,0,2019.0,2.0,3.0,28.0,59.0,19.0,SN,24.0,45.0
311595,495.0,1.0,41.377191,2.149283,C/ DIPUTACIÓ - TARRAGONA,11.000000,13.000000,1.0,1.551384e+09,1.0,1.0,1.0,1.0,13.000000,0.0,0,2019.0,2.0,3.0,28.0,59.0,20.0,SN,24.0,45.0
311596,495.0,1.0,41.377191,2.149283,C/ DIPUTACIÓ - TARRAGONA,11.000000,13.000000,1.0,1.551388e+09,1.0,1.0,1.0,1.0,13.000000,0.0,0,2019.0,2.0,3.0,28.0,59.0,21.0,SN,24.0,45.0
311597,495.0,1.0,41.377191,2.149283,C/ DIPUTACIÓ - TARRAGONA,10.083333,13.916667,1.0,1.551391e+09,1.0,1.0,1.0,1.0,13.916667,0.0,0,2019.0,2.0,3.0,28.0,59.0,22.0,SN,24.0,45.0


In [13]:
print_duplicates(dades_2019_Febrer_info_old, ['station_id', 'last_updated'])

physical_configuration                  1
lat                                     1
lon                                     1
street_name                             1
num_docks_available                     1
num_bikes_available                     1
status                                  1
is_installed                            1
is_renting                              1
is_returning                            1
is_charging_station                     1
num_bikes_available_types.mechanical    1
num_bikes_available_types.ebike         1
post_code                               1
year_last_updated_date                  1
month_last_updated_date                 1
dayofweek_last_updated_date             1
dayofmonth_last_updated_date            1
dayofyear_last_updated_date             1
hour_last_updated_date                  1
street_number                           1
capacity                                1
altitude                                1
dtype: int64

In [14]:
get_features_nans(dades_2019_Febrer_info_old)

{}

In [15]:
get_features_zero(dades_2019_Febrer_info_old)

{'physical_configuration': 90.28077753779698,
 'num_docks_available': 0.5128386162985119,
 'num_bikes_available': 49.67923517084458,
 'status': 49.0036874316028,
 'num_bikes_available_types.mechanical': 49.67923517084458,
 'num_bikes_available_types.ebike': 100.0,
 'post_code': 100.0,
 'dayofweek_last_updated_date': 14.26448736998514,
 'hour_last_updated_date': 4.160475482912332,
 'altitude': 3.6717062634989204}

In [16]:
get_columns_unique(dades_2019_Febrer_info_old)

{'station_id': array([496., 150., 151., 152., 153., 154., 155., 156., 157., 158., 159.,
        160., 149., 161., 163., 164., 165., 166., 167., 168., 169., 170.,
        171., 172., 173., 162., 148., 147., 146., 121., 122., 123., 124.,
        125., 126., 127., 128., 129., 130., 131., 132., 133., 134., 135.,
        136., 137., 138., 139., 140., 141., 142., 143., 144., 145., 174.,
        175., 176., 177., 207., 208., 209., 210., 211., 212., 213., 214.,
        215., 216., 217., 218., 219., 220., 221., 222., 223., 224., 225.,
        226., 227., 228., 229., 230., 231., 206., 120., 205., 203., 178.,
        179., 180., 181., 182., 183., 184., 185., 186., 187., 188., 189.,
        190., 191., 192., 193., 194., 195., 196., 197., 198., 199., 200.,
        201., 202., 204., 119., 118., 117.,  32.,  33.,  34.,  35.,  36.,
         37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,
         48.,  49.,  50.,  51.,  53.,  54.,  55.,  56.,  57.,  31.,  58.,
         30.,  28.,   2.