In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re

from tqdm.notebook import tqdm

from tools import * 

In [2]:
def read_status_estacion_mes(config:dict):
    """Clean one month of raw station-status data and save it as a CSV checkpoint.

    Parameters
    ----------
    config
        Despite the ``dict`` annotation, the fields are read with attribute
        access (the caller in this notebook passes a ``pandas.Series``).
        Fields used: ``path``, ``year``, ``month``, ``monthname``,
        ``datafrom`` (raw dataset folder/suffix), ``dataset`` (output
        folder/suffix) and ``devide_by``.

    Returns
    -------
    None
        The cleaned frame is written to
        ``{path}/{year}/{dataset}/{year}_{month:02d}_{monthname}_{dataset}.csv``.
        If the raw file does not exist, ``'File not found'`` is printed and
        nothing is written.
    """
    source_csv = (
        f'{config.path}/{config.year}/{config.datafrom}/'
        f'{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom}.csv'
    )
    if not os.path.exists(source_csv):
        print('File not found')
        return

    data_df = pd.read_csv(source_csv)

    initial_size = data_df.shape[0]
    print(data_df.shape)

    # Cast the charging-station flag to an integer column.
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same type.
    data_df['is_charging_station'] = data_df['is_charging_station'].astype(int)

    # Encode the textual status as an integer code:
    #   0 = in service / open, 1 = closed / not in service,
    #   2 = MAINTENANCE (installed but closed), 3 = PLANNED (not installed).
    # Assign the result back instead of using a chained `inplace=True`, which
    # does not update the parent frame under pandas Copy-on-Write.
    status_codes = {
        'IN_SERVICE': 0, 'OPEN': 0, 'OPN': 0,
        'CLS': 1, 'CLOSED': 1, 'NOT_IN_SERVICE': 1,
        'MAINTENANCE': 2,
        'PLANNED': 3,
    }
    data_df['status'] = data_df['status'].replace(status_codes)

    # Rows without a last_reported value fall back to last_updated.
    missing_report = data_df['last_reported'].isna()
    data_df.loc[missing_report, 'last_reported'] = data_df.loc[missing_report, 'last_updated']

    # Remove duplicated last_reported entries for all stations
    # (helper from `tools`; exact de-duplication rule not visible here).
    data_df = remove_duplicates_all(data_df.copy(), 'last_reported')

    # Expand last_updated into date-part columns (year/month/day/hour/minutes...).
    data_df = convert_timestamp(data_df.copy(), ['last_updated'], sort=True, add=True)

    # Rebuild last_updated on a grid of `config.devide_by` minutes
    # (presumably rounding to a multiple of that value -- confirm in `tools`).
    data_df = timestamp_multipleof(
        devide_by=config.devide_by,
        column='minutes_last_updated_date',
        df=data_df.copy(),
        new_column='last_updated',
        year_column='year_last_updated_date',
        month_column='month_last_updated_date',
        day_column='dayofmonth_last_updated_date',
        hour_column='hour_last_updated_date',
        minutes_column='minutes_last_updated_date'
    )

    data_df = data_df.drop(columns=['minutes_last_updated_date', 'week_last_updated_date'])

    # After bucketing, several rows can share the same last_updated: de-duplicate again.
    data_df = remove_duplicates_all(data_df.copy(), 'last_updated')

    print(data_df.shape)
    print('removed:', initial_size - data_df.shape[0])

    data_df = data_df.reset_index(drop=True)
    data_df = data_df.drop(columns=['ttl'])

    # Save checkpoint.
    data_df.to_csv(
        f'{config.path}/{config.year}/{config.dataset}/'
        f'{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv',
        index=False
    )

def _trailing_street_number(name):
    """Return the digits at the very end of `name`, or the string 'nan' if there are none."""
    # Non-string values (e.g. a missing name read as NaN) have no number to extract.
    if not isinstance(name, str):
        return 'nan'
    match = re.search(r"\d+$", name)
    return match.group(0) if match else 'nan'


def read_status_informacio_mes(config:dict) -> None:
    """Clean one month of raw station-information data and save it as a CSV checkpoint.

    Parameters
    ----------
    config
        Despite the ``dict`` annotation, the fields are read with attribute
        access (the caller in this notebook passes a ``pandas.Series``).
        Fields used: ``path``, ``year``, ``month``, ``monthname``,
        ``datafrom`` (raw dataset folder/suffix), ``dataset`` (output
        folder/suffix) and ``devide_by``.

    Returns
    -------
    None
        The cleaned frame is written to
        ``{path}/{year}/{dataset}/{year}_{month:02d}_{monthname}_{dataset}.csv``.
        If the raw file does not exist, ``'File not found'`` is printed and
        nothing is written.
    """
    source_csv = (
        f'{config.path}/{config.year}/{config.datafrom}/'
        f'{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom}.csv'
    )
    if not os.path.exists(source_csv):
        print('File not found')
        return

    data_df = pd.read_csv(source_csv)

    initial_size = data_df.shape[0]
    print(data_df.shape)

    # Normalise altitude: '0.1', 'nan' and missing values become '0', then the
    # column is stored as integer-valued strings.
    # `np.int` was removed in NumPy 1.24; the builtin `int` is the same type.
    data_df.loc[data_df.altitude.isin(['0.1', 'nan', np.nan]), 'altitude'] = '0'
    data_df['altitude'] = data_df['altitude'].astype(int).astype(str)

    # Rows whose altitude is not one of '0'..'199' are treated as column-shifted
    # records (the original author noted 485 such rows and a capacity column
    # filled with 1 for them).
    cond = (~data_df.altitude.isin([str(x) for x in range(200)] + [np.nan]))
    print(data_df[cond].shape)

    # Shift the affected rows one column to the right. The order matters:
    # each assignment reads a column that has not been overwritten yet.
    data_df.loc[cond, ['capacity']] = data_df[cond].post_code
    data_df.loc[cond, ['post_code']] = data_df[cond].address
    data_df.loc[cond, ['address']] = data_df[cond].altitude
    data_df.loc[cond, ['altitude']] = '0'
    # Assign back instead of a chained `inplace=True`, which does not update
    # the parent frame under pandas Copy-on-Write.
    data_df['altitude'] = data_df['altitude'].fillna('0')

    # The post code is unreliable and would need fixing from lat/lon; it is
    # blanked here and can be restored from the old dataset after the merge.
    data_df['post_code'] = '0'

    # Expand last_updated into date-part columns (year/month/day/hour/minutes...).
    data_df = convert_timestamp(data_df.copy(), ['last_updated'], sort=True, add=True)

    # Rebuild last_updated on a grid of `config.devide_by` minutes
    # (presumably rounding to a multiple of that value -- confirm in `tools`).
    data_df = timestamp_multipleof(
        devide_by=config.devide_by,
        column='minutes_last_updated_date',
        df=data_df.copy(),
        new_column='last_updated',
        year_column='year_last_updated_date',
        month_column='month_last_updated_date',
        day_column='dayofmonth_last_updated_date',
        hour_column='hour_last_updated_date',
        minutes_column='minutes_last_updated_date'
    )

    # The date-part helper columns are no longer needed.
    data_df = data_df.drop(
        columns=[
            'year_last_updated_date', 'month_last_updated_date',
            'week_last_updated_date', 'dayofweek_last_updated_date',
            'dayofmonth_last_updated_date', 'dayofyear_last_updated_date',
            'hour_last_updated_date', 'minutes_last_updated_date'
        ]
    )

    # Encode the station type: 0 = regular bike station, 1 = electric bike station.
    configuration_codes = {
        'REGULAR': 0, 'BIKE': 0, 'BIKESTATION': 0,
        'BIKE-ELECTRIC': 1, 'ELECTRICBIKESTATION': 1,
    }
    data_df['physical_configuration'] = data_df['physical_configuration'].replace(configuration_codes)

    # Split the station name into a street name (ASCII letter runs joined by
    # spaces) and a street number (trailing digits, or 'nan').
    # NOTE(review): "[a-zA-Z]+" drops accented letters, so names with accents
    # are split at the accent; kept as-is so the output stays unchanged.
    data_df['street_name'] = data_df['name'].map(
        lambda name: " ".join(re.findall("[a-zA-Z]+", name))
    )
    data_df['street_number'] = data_df['name'].map(_trailing_street_number)

    # These columns do not exist anywhere in the new dataset, so they are removed.
    data_df = data_df.drop(columns=['address', 'name'])

    # After bucketing, several rows can share the same last_updated: de-duplicate
    # (helper from `tools`; exact de-duplication rule not visible here).
    data_df = remove_duplicates_all(data_df.copy(), 'last_updated')

    print(data_df.shape)
    print('removed:', initial_size - data_df.shape[0])

    data_df = data_df.reset_index(drop=True)
    data_df = data_df.drop(columns=['ttl'])

    # Save checkpoint.
    data_df.to_csv(
        f'{config.path}/{config.year}/{config.dataset}/'
        f'{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv',
        index=False
    )

def get_file_length(config:dict):
    """Return the ``(rows, columns)`` shape of the raw monthly CSV described by `config`.

    The file read is
    ``{path}/{year}/{datafrom}/{year}_{month:02d}_{monthname}_{datafrom}.csv``.
    ``config.path`` is used as the base directory so the lookup matches the
    directory the caller works in; when `config` has no ``path`` field the
    previous hard-coded ``'../dades'`` is used.

    Note that the whole CSV is loaded just to measure it, which is slow for
    multi-million-row files.
    """
    base_path = getattr(config, 'path', '../dades')
    data_df = pd.read_csv(
        filepath_or_buffer=(
            f'{base_path}/{config.year}/{config.datafrom}/'
            f'{config.year}_{config.month:02d}_{config.monthname}_{config.datafrom}.csv'
        ),
        header=0,
        low_memory=False,
    )
    return data_df.shape
    
def read_informacion_estacion_anual(input_dataset:str, year:int, path:str='dades'):
    """Clean every monthly file of `input_dataset` for `year`, skipping existing checkpoints.

    Parameters
    ----------
    input_dataset
        Name of the raw dataset folder/suffix. Only ``'BicingNou_ESTACIONS'``
        and ``'BicingNou_INFORMACIO'`` have a cleaning routine.
    year
        Year to process; must be between 2018 and 2023 inclusive.
    path
        Base data directory containing ``{year}/{input_dataset}/``.

    Raises
    ------
    AssertionError
        If `input_dataset` is empty or `year` is outside 2018-2023.

    Notes
    -----
    Cleaned files are written to ``{path}/{year}/{input_dataset}_CLEAN/``.
    A month whose cleaned file already exists is not reprocessed; the shape
    reported for it comes from ``get_file_length``.
    """
    assert input_dataset != "", "input_dataset must not be empty"
    assert year >= 2018 and year <= 2023, "year must be between 2018 and 2023"

    config = pd.Series({
        'devide_by':60,
        'year':year,
        'datafrom': input_dataset,
        'dataset': f'{input_dataset}_CLEAN',
        'ttl': 30,
        'month': np.nan,
        'monthname': np.nan,
        'path':path
    })

    # Create the output folder without going through a shell: this is portable
    # and safe for paths containing spaces or shell metacharacters.
    os.makedirs(f"{config.path}/{config.year}/{config.dataset}", exist_ok=True)

    # `i2m` comes from `tools`; it is iterated as (month number, month name) pairs.
    for month, month_name in i2m:
        config.month = month
        config.monthname = month_name
        print(year, month, month_name, input_dataset)

        clean_csv = (
            f'{config.path}/{config.year}/{config.dataset}/'
            f'{config.year}_{config.month:02d}_{config.monthname}_{config.dataset}.csv'
        )
        if not os.path.exists(clean_csv):
            if input_dataset == 'BicingNou_ESTACIONS':
                read_status_estacion_mes(config)
            elif input_dataset == 'BicingNou_INFORMACIO':
                read_status_informacio_mes(config)
            else:
                # TODO: add an elif for each dataset we want to support in the future.
                # Say so explicitly instead of silently doing nothing.
                print('Unsupported dataset:', input_dataset)
        else:
            print('found file with shape equal to: ', get_file_length(config))

        print('Done -------- ----------')


In [3]:
%%time

# Clean every monthly BicingNou_ESTACIONS file for 2019 and write the *_CLEAN CSV
# checkpoints under ../dades (intended for later exploration with dask).
# Months whose checkpoint already exists are skipped, so re-running is cheap.
read_informacion_estacion_anual('BicingNou_ESTACIONS', 2019, '../dades')

# TODO: clean and prepare the data for all remaining months

2019 1 Gener BicingNou_ESTACIONS
File not found
Done -------- ----------
2019 2 Febrer BicingNou_ESTACIONS
File not found
Done -------- ----------
2019 3 Marc BicingNou_ESTACIONS
found file with shape equal to:  (355467, 13)
Done -------- ----------
2019 4 Abril BicingNou_ESTACIONS
found file with shape equal to:  (3018524, 13)
Done -------- ----------
2019 5 Maig BicingNou_ESTACIONS
found file with shape equal to:  (3553843, 13)
Done -------- ----------
2019 6 Juny BicingNou_ESTACIONS
found file with shape equal to:  (3466316, 13)
Done -------- ----------
2019 7 Juliol BicingNou_ESTACIONS
found file with shape equal to:  (3238510, 13)
Done -------- ----------
2019 8 Agost BicingNou_ESTACIONS
found file with shape equal to:  (3660322, 13)
Done -------- ----------
2019 9 Setembre BicingNou_ESTACIONS
found file with shape equal to:  (3518215, 13)
Done -------- ----------
2019 10 Octubre BicingNou_ESTACIONS
found file with shape equal to:  (3655823, 13)
Done -------- ----------
2019 11 No

  0%|          | 0/410 [00:00<?, ?it/s]

  0%|          | 0/410 [00:00<?, ?it/s]

(291368, 19)
removed: 3225612
Done -------- ----------
2019 12 Desembre BicingNou_ESTACIONS
(3655150, 13)


  0%|          | 0/410 [00:00<?, ?it/s]

  0%|          | 0/410 [00:00<?, ?it/s]

(301527, 19)
removed: 3353623
Done -------- ----------
CPU times: user 40min 17s, sys: 5.65 s, total: 40min 23s
Wall time: 40min 25s


In [4]:
%%time

# Clean every monthly BicingNou_ESTACIONS file for 2020 and write the *_CLEAN CSV
# checkpoints under ../dades (intended for later exploration with dask).
# Months whose checkpoint already exists are skipped, so re-running is cheap.
read_informacion_estacion_anual('BicingNou_ESTACIONS', 2020, '../dades')


2020 1 Gener BicingNou_ESTACIONS
found file with shape equal to:  (3398708, 13)
Done -------- ----------
2020 2 Febrer BicingNou_ESTACIONS
found file with shape equal to:  (3500841, 13)
Done -------- ----------
2020 3 Marc BicingNou_ESTACIONS
found file with shape equal to:  (2408419, 13)
Done -------- ----------
2020 4 Abril BicingNou_ESTACIONS
found file with shape equal to:  (3892389, 13)
Done -------- ----------
2020 5 Maig BicingNou_ESTACIONS
found file with shape equal to:  (3742388, 13)
Done -------- ----------
2020 6 Juny BicingNou_ESTACIONS
found file with shape equal to:  (4099864, 13)
Done -------- ----------
2020 7 Juliol BicingNou_ESTACIONS
found file with shape equal to:  (4335089, 13)
Done -------- ----------
2020 8 Agost BicingNou_ESTACIONS
found file with shape equal to:  (3869580, 13)
Done -------- ----------
2020 9 Setembre BicingNou_ESTACIONS
found file with shape equal to:  (4246500, 13)
Done -------- ----------
2020 10 Octubre BicingNou_ESTACIONS
found file with s

In [5]:
%%time

# Clean every monthly BicingNou_ESTACIONS file for 2021 and write the *_CLEAN CSV
# checkpoints under ../dades (intended for later exploration with dask).
# Months whose checkpoint already exists are skipped, so re-running is cheap.
read_informacion_estacion_anual('BicingNou_ESTACIONS', 2021, '../dades')


2021 1 Gener BicingNou_ESTACIONS
found file with shape equal to:  (4509149, 13)
Done -------- ----------
2021 2 Febrer BicingNou_ESTACIONS
found file with shape equal to:  (4017900, 13)
Done -------- ----------
2021 3 Marc BicingNou_ESTACIONS
found file with shape equal to:  (4528030, 13)
Done -------- ----------
2021 4 Abril BicingNou_ESTACIONS
found file with shape equal to:  (4389888, 13)
Done -------- ----------
2021 5 Maig BicingNou_ESTACIONS
found file with shape equal to:  (4538684, 13)
Done -------- ----------
2021 6 Juny BicingNou_ESTACIONS
found file with shape equal to:  (4308619, 13)
Done -------- ----------
2021 7 Juliol BicingNou_ESTACIONS
found file with shape equal to:  (4436678, 13)
Done -------- ----------
2021 8 Agost BicingNou_ESTACIONS
found file with shape equal to:  (4520650, 13)
Done -------- ----------
2021 9 Setembre BicingNou_ESTACIONS
found file with shape equal to:  (4384751, 13)
Done -------- ----------
2021 10 Octubre BicingNou_ESTACIONS
found file with s

In [6]:
%%time

# Clean every monthly BicingNou_ESTACIONS file for 2022 and write the *_CLEAN CSV
# checkpoints under ../dades (intended for later exploration with dask).
# Months whose checkpoint already exists are skipped, so re-running is cheap.
read_informacion_estacion_anual('BicingNou_ESTACIONS', 2022, '../dades')


2022 1 Gener BicingNou_ESTACIONS
found file with shape equal to:  (4390690, 14)
Done -------- ----------
2022 2 Febrer BicingNou_ESTACIONS
found file with shape equal to:  (3936804, 14)
Done -------- ----------
2022 3 Marc BicingNou_ESTACIONS
found file with shape equal to:  (4535987, 14)
Done -------- ----------
2022 4 Abril BicingNou_ESTACIONS
found file with shape equal to:  (4371369, 14)
Done -------- ----------
2022 5 Maig BicingNou_ESTACIONS
found file with shape equal to:  (4523985, 14)
Done -------- ----------
2022 6 Juny BicingNou_ESTACIONS
found file with shape equal to:  (4382454, 14)
Done -------- ----------
2022 7 Juliol BicingNou_ESTACIONS
found file with shape equal to:  (4524353, 14)
Done -------- ----------
2022 8 Agost BicingNou_ESTACIONS
found file with shape equal to:  (4497691, 14)
Done -------- ----------
2022 9 Setembre BicingNou_ESTACIONS
found file with shape equal to:  (4362921, 14)
Done -------- ----------
2022 10 Octubre BicingNou_ESTACIONS
found file with s