In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import os

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the version numerically: a plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import scipy.stats as stats

import seaborn as sns

from datetime import datetime

import re

from tqdm.notebook import tqdm

import dask.dataframe as dd
from dask.distributed import Client

sys.path.insert(0, 'tools/')

from tools import * 

In [2]:
# Create a Dask distributed client with default settings; the captured output
# below shows it attached to a scheduler on 127.0.0.1 with 4 workers.
client = Client()
# Bare expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:35631  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 16  Memory: 16.44 GB


# Merge

In [3]:
# Correlation
def calculate_correlation(data, variable1, variable2):
    """Compute, print and return the correlation between two columns.

    Parameters
    ----------
    data : lazy dataframe-like
        Object whose columns support ``.corr(other)`` returning something with
        ``.compute()`` (the notebook works with Dask dataframes).
    variable1, variable2 : str
        Names of the two columns to correlate.

    Returns
    -------
    The computed correlation value (it is also printed).
    """
    correlation = data[variable1].corr(data[variable2]).compute()
    print(f"Correlación entre {variable1} y {variable2}: {correlation}")
    # Return the value itself so callers can reuse it, not the None from print().
    return correlation

In [4]:
def matriz_correlation(data, variable1, variable2):
    """Print the correlation matrix of two columns of ``data``.

    ``data`` must support selection by a list of column names, and the
    selection must offer ``.corr().compute()``. Nothing is returned.
    """
    selected_columns = [variable1, variable2]
    print(data[selected_columns].corr().compute())

In [5]:
# Settings passed to read_data_all, which iterates over 'years' and hands the
# configuration to load_checkpoint.
# NOTE(review): how 'path', 'file_type' and 'dataset' are interpreted is decided
# inside load_checkpoint (imported from tools, not shown here) — confirm there.
config = pd.Series({
    'path': os.path.abspath('dades'),  # absolute form of 'dades', resolved against the current working directory
    'file_type':'csv',
    'years':[2019, 2021, 2022],
    'dataset': 'BicingNou_ESTACIONS_MOD'
})

In [6]:
%%time

def read_data_all(cnfg:dict):

    data = dict()

    for year in cnfg.years:
        cnfg['year'] = year
        data[year] = load_checkpoint(cnfg)
    
    return dd.concat(list(data.values()), interleave_partitions=False)
    
bbdd_completa = read_data_all(config)   

checkpoint reloaded.
checkpoint reloaded.
checkpoint reloaded.
CPU times: user 118 ms, sys: 0 ns, total: 118 ms
Wall time: 112 ms


In [7]:
%%time

index=0
before = get_ddf_shape(bbdd_completa)
print(before)

index+=1
print(index)
# 1 Error fix. # 2019 gener, febrer and marc have the status reversed
cond = (bbdd_completa.year == 2019) & (bbdd_completa.month.isin([1,2,3]))
bbdd_completa['status'] = bbdd_completa['status'].mask(cond, ((bbdd_completa['status'] + 1) %2))
index+=1
print(index)
# 2 Remove not needed status, manitenance, planned. open = 0, closed = 1 
cond = (bbdd_completa['status'].isin([2.0,3.0]))
bbdd_completa['status'] = bbdd_completa['status'].mask(cond, 1.0)
# bbdd_completa = bbdd_completa.dropna(subset=['status'])
index+=1
print(index)
# 3 Remove data from 2020
cond = (bbdd_completa['year'].isin([2020]))
bbdd_completa = bbdd_completa.mask(cond, np.nan)
bbdd_completa = bbdd_completa.dropna(subset=['year'])
index+=1
print(index)
# 4 2019 to 2020 to have continuos data 
cond = (bbdd_completa['year'].isin([2019]))
bbdd_completa['year'] = bbdd_completa['year'].mask(cond, bbdd_completa['year']+1)
index+=1
print(index)
# 5 status reversed 
bbdd_completa['status'] = (bbdd_completa['status'] + 1)%2
index+=1
print(index)
# 6 Column has all ones
bbdd_completa = bbdd_completa.drop(columns=['is_installed'])
index+=1
print(index)
# 7 Column has all ones
bbdd_completa = bbdd_completa.drop(columns=['is_charging_station'])
index+=1
print(index)
# 8 Remove row where status = closed and is renting and is returning
# cond = (bbdd_completa['status'].isin([0.0])) & (bbdd_completa['is_renting'].isin([1.0])) & (bbdd_completa['is_returning'].isin([1.0]))
# bbdd_completa = bbdd_completa.mask(cond, np.nan)
# bbdd_completa = bbdd_completa.dropna(subset=['status'])

after = get_ddf_shape(bbdd_completa)
print(after)

print('Changes to dataframe durinf preprocessing')
print(f'dropeed {(after[0]-before[0]):02d} rows')
print(f'dropped {(after[1]-before[1]):02d} columns')

(12419994, 23)
1
2
3
4
5
6
7
8
(12419489, 21)
Changes to dataframe durinf preprocessing
dropeed -505 rows
dropped -2 columns
CPU times: user 5.6 s, sys: 256 ms, total: 5.86 s
Wall time: 17.3 s


In [8]:
%%time

unique_ids_by_year = bbdd_completa.groupby('year')['station_id'].unique().compute()
print(unique_ids_by_year)

# Calcula los station_id comunes en todos los años
common_ids = set(unique_ids_by_year[2020])
for year, ids in unique_ids_by_year.items():
    print(year, ids.shape)
    common_ids = common_ids.intersection(set(ids))
# common_ids
print(len(common_ids))

filtered_bbdd = bbdd_completa[bbdd_completa['station_id'].isin(common_ids)]
nonfiltered_bbdd = bbdd_completa[~bbdd_completa['station_id'].isin(common_ids)]

year
2020.0    [1, 320, 319, 318, 317, 316, 315, 314, 313, 31...
2021.0    [1.0, 271.0, 400.0, 510.0, 26.0, 270.0, 269.0,...
2022.0    [1, 380, 28, 379, 378, 377, 376, 375, 29, 374,...
Name: station_id, dtype: object
2020.0 (410,)
2021.0 (509,)
2022.0 (510,)
408
CPU times: user 1.58 s, sys: 134 ms, total: 1.72 s
Wall time: 9.24 s


In [21]:
%%time

before = get_ddf_shape(filtered_bbdd)
print(before)

filtered_bbdd = filtered_bbdd.drop_duplicates(subset=['timestamp', 'station_id'], keep='last')

after = get_ddf_shape(filtered_bbdd)
print(after)

print('Changes to dataframe durinf preprocessing')
print(f'dropeed {(after[0]-before[0]):02d} rows')
print(f'dropped {(after[1]-before[1]):02d} columns')



KilledWorker: ("('drop-duplicates-agg-184be3ca62cfabcca102b28a57c0cabc', 0)", <Worker 'tcp://127.0.0.1:36589', name: 3, memory: 0, processing: 1>)

In [10]:
%%time

before = get_ddf_shape(nonfiltered_bbdd)
print(before)

nonfiltered_bbdd = nonfiltered_bbdd.drop_duplicates(subset=['timestamp', 'station_id'], keep='last')

after = get_ddf_shape(nonfiltered_bbdd)
print(after)

print('Changes to dataframe durinf preprocessing')
print(f'dropeed {(after[0]-before[0]):02d} rows')
print(f'dropped {(after[1]-before[1]):02d} columns')

(1791834, 21)
(1771494, 21)
Changes to dataframe durinf preprocessing
dropeed -20340 rows
dropped 00 columns
CPU times: user 9.9 s, sys: 665 ms, total: 10.6 s
Wall time: 24.5 s


## General view

In [11]:
bbdd_completa.columns

Index(['station_id', 'num_docks_available', 'num_bikes_available', 'status',
       'timestamp', 'is_renting', 'is_returning',
       'num_bikes_available_types.mechanical',
       'num_bikes_available_types.ebike', 'year', 'month', 'dayofweek', 'day',
       'dayofyear', 'hour', 'capacity', 'ctx0', 'ctx1', 'ctx2', 'ctx3',
       'ctx4'],
      dtype='object')

In [12]:

len(bbdd_completa.divisions) # divisions mark the start and end of each partition
# in our case the divisions are None (unknown), which is fine here

61

In [13]:
bbdd_completa.head(2)

Unnamed: 0,station_id,num_docks_available,num_bikes_available,status,timestamp,is_renting,is_returning,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,year,month,dayofweek,day,dayofyear,hour,capacity,ctx0,ctx1,ctx2,ctx3,ctx4
0,1,8.363636,17.636364,1.0,1546300800,1,1,17.636364,0.0,2020,1,1,1,1,0,46.0,0.181818,0.181818,0.181818,0.181818,0.181818
1,1,0.818182,26.181818,1.0,1546380000,1,1,26.181818,0.0,2020,1,1,1,1,22,46.0,0.017787,0.021739,0.023715,0.106719,0.094203


In [14]:
%%time

get_ddf_shape(bbdd_completa)

CPU times: user 4.4 s, sys: 335 ms, total: 4.73 s
Wall time: 11.3 s


(12419489, 21)

In [15]:
%%time

nans = get_features_nans(bbdd_completa)

zeros = get_features_zero(bbdd_completa)

CPU times: user 11.7 s, sys: 845 ms, total: 12.5 s
Wall time: 39.3 s


In [16]:
nans

{}

In [17]:
zeros

{'num_docks_available': 1.0849077606977227,
 'num_bikes_available': 6.601189469228565,
 'status': 3.552247600525271,
 'is_renting': 0.0620879007179764,
 'is_returning': 0.06201543396833799,
 'num_bikes_available_types.mechanical': 16.168338326963372,
 'num_bikes_available_types.ebike': 30.1792448948584,
 'dayofweek': 14.259668815681547,
 'hour': 4.314356250889227,
 'ctx0': 1.0849077606977227,
 'ctx1': 1.0829592103185566,
 'ctx2': 1.0801491108048005,
 'ctx3': 1.0772826482635478,
 'ctx4': 1.0744886524719335}

In [18]:
%%time 

uniques = get_columns_unique(bbdd_completa)

CPU times: user 31 s, sys: 1.89 s, total: 32.9 s
Wall time: 2min 59s


In [19]:
uniques

{'station_id': 0        1.0
 1      320.0
 2      319.0
 3      318.0
 4      317.0
        ...  
 507    446.0
 508     91.0
 509    467.0
 510     52.0
 511    431.0
 Name: station_id, Length: 512, dtype: float64,
 'num_docks_available': 0         8.363636
 1         0.818182
 2         0.000000
 3         6.818182
 4         7.909091
            ...    
 12304    36.266667
 12305    19.151515
 12306     3.823529
 12307    24.433333
 12308     7.888889
 Name: num_docks_available, Length: 12309, dtype: float64,
 'num_bikes_available': 0        17.636364
 1        26.181818
 2        27.000000
 3        21.181818
 4        20.181818
            ...    
 10123    29.333333
 10124     4.757576
 10125    24.176471
 10126     2.566667
 10127    52.708333
 Name: num_bikes_available, Length: 10128, dtype: float64,
 'status': 0    1.0
 1    0.0
 Name: status, dtype: float64,
 'timestamp': 0        1.546301e+09
 1        1.546380e+09
 2        1.546402e+09
 3        1.546416e+09
 4        1.54

In [20]:
# Five tick positions for the timestamp axis: the minimum, mean - std, mean,
# mean + std and the maximum of the unique timestamps.
unique_timestamps = uniques['timestamp']
timestamp_mean = unique_timestamps.mean()
timestamp_std = unique_timestamps.std()

xticks = np.asanyarray(
    (
        unique_timestamps.min(),
        np.round(timestamp_mean - timestamp_std).astype(int),
        np.round(timestamp_mean).astype(int),
        np.round(timestamp_mean + timestamp_std).astype(int),
        unique_timestamps.max(),
    )
)
xticks

array([1.54630080e+09, 1.57463877e+09, 1.61498426e+09, 1.65532975e+09,
       1.67252760e+09])

# Station IDs

In [None]:
%%time

print(stats.describe(bbdd_completa.station_id))
bbdd_completa.station_id.describe().compute()

In [None]:
%%time

show_counts(bbdd_completa.station_id.compute())

# IDs comunes

In [None]:
%%time

get_ddf_shape(filtered_bbdd)
# NOTE(review): a shape of (12531377, 23) was recorded here for bbdd_completa, but
# the preprocessing output above reports (12419489, 21) — presumably stale.

In [None]:
%%time

# Plot station_id against timestamp for the stations present in every year
# (filtered_bbdd). scatter_columns comes from tools.
# NOTE(review): the meaning of the third positional argument and of `tail`
# is defined in tools — confirm there.
scatter_columns(
    filtered_bbdd, 
    'timestamp', 
    'station_id', 
    'station_id', 
    tail=True, 
    xticks=xticks,
    figsize=(10,8)
)

## IDs no comunes (testing)

In [None]:
%%time

get_ddf_shape(nonfiltered_bbdd)
# NOTE(review): a shape of (12531377, 23) was recorded here for bbdd_completa, but
# the preprocessing output above reports (12419489, 21) — presumably stale.

In [None]:
%%time

# Plot station_id against timestamp for the stations that are NOT present in
# every year (nonfiltered_bbdd). scatter_columns comes from tools.
# NOTE(review): the meaning of the third positional argument and of `tail`
# is defined in tools — confirm there.
scatter_columns(
    nonfiltered_bbdd, 
    'timestamp', 
    'station_id', 
    'station_id', 
    tail=True, 
    xticks=xticks,
    figsize=(10,8)
)

# Data exploration__________________________________

## · Descriptiva
Media, mediana, desviación estándar, mínimo y máximo para las columnas numéricas. Dan una idea general de la distribución y el rango de los valores en esas columnas.

<div>
    <table>
        <tr>
            <th><p>Camp</p></th>
            <th><p>Descripció</p></th>
        </tr>
        <tr>
            <td><p>last_updated</p></td>
            <td><p>Timestamp de l'arxiu</p></td>
        </tr>
        <tr>
            <td><p>ttl</p></td>
            <td><p>TimeToLive de la resposta</p></td>
        </tr>
        <tr>
            <td><p>data</p></td>
            <td><p>Contenidor d'arrays d'informació d'estacions</p></td>
        </tr>
        <tr>
            <td><p>stations</p></td>
            <td><p>Array de dades de cada estació</p></td>
        </tr>
        <tr>
            <td><p>station_id</p></td>
            <td><p>Identificador de l'estació</p></td>
        </tr>
        <tr>
            <td><p>num_bikes_available</p></td>
            <td><p>Nombre de bicicletes disponibles</p></td>
        </tr>
        <tr>
            <td><p>num_bikes_available_types</p></td>
            <td><p>Array de tipus de bicicletes disponibles</p></td>
        </tr>
        <tr>
            <td><p>mechanical</p></td>
            <td><p>Nombre de bicicletes mecàniques disponibles</p></td>
        </tr>
        <tr>
            <td><p>ebike</p></td>
            <td><p>Nombre de bicicletes elèctriques disponibles</p></td>
        </tr>
        <tr>
            <td><p>num_docks_available</p></td>
            <td><p>Nombre d'ancoratges disponibles</p></td>
        </tr>
        <tr>
            <td><p>is_installed</p></td>
            <td><p>L'estació està correctament instal·lada (0-NO, 1-SI)</p></td>
        </tr>
        <tr>
            <td><p>is_renting</p></td>
            <td><p>L'estació està proporcionant bicicletes correctament</p></td>
        </tr>
        <tr>
            <td><p>is_returning</p></td>
            <td><p>L'estació està ancorant bicicletes correctament</p></td>
        </tr>
        <tr>
            <td><p>last_reported</p></td>
            <td><p>Timestamp de la informació de l'estació</p></td>
        </tr>
        <tr>
            <td><p>is_charging_station</p></td>
            <td><p>L'estació té capacitat de càrrega de bicicletes elèctriques</p></td>
        </tr>
        <tr>
            <td><p>status</p></td>
            <td><p>
                Estat de l'estació (IN_SERVICE=En servei, CLOSED=Tancada).<br>
                Codificació: 'IN_SERVICE', 'OPEN', 'OPN' = 0; 'CLS', 'CLOSED', 'NOT_IN_SERVICE' = 1; 'MAINTENANCE' = 2; 'PLANNED' = 3
                </p></td>
        </tr>
    </table>
</div>

# Time columns 

## 'timestamp'

In [None]:
%%time
print(stats.describe(filtered_bbdd.timestamp))
filtered_bbdd.timestamp.describe()

show_counts(filtered_bbdd.timestamp.compute())

## 'year' 

In [None]:
%%time
print(stats.describe(filtered_bbdd.year))
filtered_bbdd.year.describe()
show_counts(filtered_bbdd.year.compute())

## 'month'

In [None]:
%%time
print(stats.describe(filtered_bbdd.month))
filtered_bbdd.month.describe()
show_counts(filtered_bbdd.month.compute())

## 'dayofweek' 

In [None]:
%%time
print(stats.describe(filtered_bbdd.dayofweek))
filtered_bbdd.dayofweek.describe()
show_counts(filtered_bbdd.dayofweek.compute())

## 'day'

In [None]:
%%time
print(stats.describe(filtered_bbdd.day))
filtered_bbdd.day.describe()
show_counts(filtered_bbdd.day.compute())

## 'dayofyear' 

In [None]:
%%time
print(stats.describe(filtered_bbdd.dayofyear))
filtered_bbdd.dayofyear.describe()
show_counts(filtered_bbdd.dayofyear.compute())

## 'hour'

In [None]:
%%time
print(stats.describe(filtered_bbdd.hour))
filtered_bbdd.hour.describe()
show_counts(filtered_bbdd.hour.compute())

# Status Columns 

## 'status'

In [None]:
%%time
print(stats.describe(filtered_bbdd.status))
filtered_bbdd.status.describe()
show_counts(filtered_bbdd.status.compute())

## 'is_installed'

## 'is_renting'

In [None]:
%%time
print(stats.describe(filtered_bbdd.is_renting))
filtered_bbdd.is_renting.describe()
show_counts(filtered_bbdd.is_renting.compute())

## 'is_returning'

In [None]:
%%time
print(stats.describe(filtered_bbdd.is_returning))
filtered_bbdd.is_returning.describe()
show_counts(filtered_bbdd.is_returning.compute())

## 'is_charging_station'

## Study 

In [None]:
def filter_data(data, row_data, row_index):
    """Keep the rows of ``data`` that match one row of ``row_data``.

    Parameters
    ----------
    data : dataframe
        Frame to filter; it must support ``.copy()`` and boolean indexing.
        The input itself is never modified.
    row_data : pandas.DataFrame
        Table of filter criteria. Every column name must also exist in
        ``data``.
    row_index
        Label (as used by ``row_data.loc``) of the criteria row to apply.

    Returns
    -------
    The subset of ``data`` where each non-object column equals the value in
    the selected row and each object column is one of the values held in the
    selected row's cell.
    """
    cat_cols = row_data.select_dtypes(include=['object']).columns
    num_cols = row_data.select_dtypes(exclude=['object']).columns

    filtered = data.copy()

    row = row_data.loc[row_index]
    for num_col in num_cols:
        filtered = filtered[filtered[num_col] == row[num_col]]

    for cat_col in cat_cols:
        allowed = row[cat_col]
        # An object cell may hold a single value (e.g. a string) rather than a
        # collection; isin() rejects scalars, so wrap it in a list.
        if not pd.api.types.is_list_like(allowed):
            allowed = [allowed]
        filtered = filtered[filtered[cat_col].isin(allowed)]

    return filtered

In [None]:
# One row per (status, is_renting, is_returning) combination, sorted by those keys.
# NOTE(review): the aggregate is the SUM of station_id values, which is not a
# row count — confirm whether a count/size was intended.
status_keys = ['status', 'is_renting', 'is_returning']
(
    filtered_bbdd
    .groupby(status_keys)
    .station_id.sum()
    .compute()
    .reset_index()
    .sort_values(status_keys)
    .reset_index(drop=True)
)


In [None]:
# Unique station ids for every (status, is_renting, is_returning) combination.
status_of_station = (
    filtered_bbdd
    .groupby(['status', 'is_renting', 'is_returning'])['station_id']
    .unique()
    .compute()
)

# Turn the grouped result into a flat table ordered by the three flags, with a
# fresh 0..n-1 index so rows can be addressed by position.
status_of_station = (
    pd.DataFrame(status_of_station)
    .reset_index()
    .sort_values(['status','is_renting','is_returning'])
    .reset_index(drop=True)
)
status_of_station

In [None]:
## view of the stations status

In [None]:
%%time

# Rows of filtered_bbdd matching row 0 of status_of_station.
# Expected combination (depends on which combinations exist in the data —
# verify against the status_of_station table):
# status(r)   is_renting   is_returning
# 0.0         0.0          0.0
filtered = filter_data(filtered_bbdd, status_of_station, 0)

scatter_columns(
    filtered, 
    'timestamp', 
    'station_id', 
    'station_id', 
    tail=False, 
    xticks=xticks,
    figsize=(10,8),
    count=20
)

In [None]:
%%time

# Rows of filtered_bbdd matching row 1 of status_of_station.
# Expected combination (presumably status, is_renting, is_returning, since
# is_installed is dropped during preprocessing — verify against the table):
# status(r)   is_renting   is_returning
# 0.0         1.0          1.0
filtered = filter_data(filtered_bbdd, status_of_station, 1)

scatter_columns(
    filtered, 
    'timestamp', 
    'station_id', 
    'station_id', 
    tail=False, 
    xticks=xticks,
    figsize=(10,8),
    count=5
)

In [None]:
%%time

# Rows of filtered_bbdd matching row 2 of status_of_station.
# Expected combination (presumably status, is_renting, is_returning, since
# is_installed is dropped during preprocessing — verify against the table):
# status(r)   is_renting   is_returning
# 1.0         0.0          0.0
filtered = filter_data(filtered_bbdd, status_of_station, 2)

scatter_columns(
    filtered, 
    'timestamp', 
    'station_id', 
    'station_id', 
    tail=False, 
    xticks=xticks,
    figsize=(10,8),
    count=5
)

In [None]:
%%time

# Rows of filtered_bbdd matching row 3 of status_of_station.
# Expected combination (presumably status, is_renting, is_returning, since
# is_installed is dropped during preprocessing — verify against the table):
# status(r)   is_renting   is_returning
# 1.0         0.0          1.0
filtered = filter_data(filtered_bbdd, status_of_station, 3)

scatter_columns(
    filtered, 
    'timestamp', 
    'station_id', 
    'station_id', 
    tail=False, 
    xticks=xticks,
    figsize=(10,8),
    count=5
)

# Goal Columns 

## 'num_docks_available' 

In [None]:
%%time
print(stats.describe(filtered_bbdd.num_docks_available))
filtered_bbdd.num_docks_available.describe()
show_counts(filtered_bbdd.num_docks_available.compute())

## 'num_bikes_available' 

In [None]:
%%time
print(stats.describe(filtered_bbdd.num_bikes_available))
filtered_bbdd.num_bikes_available.describe()
show_counts(filtered_bbdd.num_bikes_available.compute())

## 'num_bikes_available_types.mechanical'

In [None]:
%%time
print(stats.describe(filtered_bbdd['num_bikes_available_types.mechanical']))
filtered_bbdd['num_bikes_available_types.mechanical'].describe()
show_counts(filtered_bbdd['num_bikes_available_types.mechanical'].compute())

## 'num_bikes_available_types.ebike'

In [None]:
%%time
print(stats.describe(filtered_bbdd['num_bikes_available_types.ebike']))
filtered_bbdd['num_bikes_available_types.ebike'].describe()
show_counts(filtered_bbdd['num_bikes_available_types.ebike'].compute())

## 'capacity' 

In [None]:
%%time
print(stats.describe(filtered_bbdd.capacity))
filtered_bbdd.capacity.describe()
show_counts(filtered_bbdd.capacity.compute())

## 'ctx0'

In [None]:
%%time
print(stats.describe(filtered_bbdd.ctx0))
filtered_bbdd.ctx0.describe()
show_counts(filtered_bbdd.ctx0.compute())

## · Visualización de la disponibilidad de bicicletas
Gráficos para visualizar la disponibilidad de bicicletas a lo largo del tiempo. Ejemplo: cantidad de bicicletas disponibles en función del tiempo para identificar patrones diarios, semanales o estacionales.

## · Análisis temporal
Datos por año, mes, día de la semana o hora del día para conocer estadísticas agregadas, como el promedio de bicicletas disponibles o el porcentaje de anclajes disponibles. Tendencias temporales y patrones de uso.

## · Relación entre variables
Explorar si hay una correlación entre el número de anclajes disponibles (num_docks_available) y el número de bicicletas disponibles (num_bikes_available). Investigar si la disponibilidad de bicicletas difiere entre los diferentes tipos de bicicletas (mecánicas y eléctricas).

## · Análisis de capacidad y porcentaje de anclajes disponibles
Capacidad (capacity) y los porcentajes de anclajes disponibles (ctx0, ctx1, ctx2, ctx3, ctx4). Promedio y distribución de la capacidad y de los porcentajes de anclajes disponibles. Objetivo: entender la utilización de las estaciones de bicicletas a lo largo del tiempo.

## Data Cleaning

In [None]:
import ctypes

def trim_memory() -> int:
    """Call ``malloc_trim(0)`` from ``libc.so.6`` and return its result.

    The shared library is looked up by its Linux/glibc name, so this only
    works where ``libc.so.6`` can be loaded.
    """
    glibc = ctypes.CDLL("libc.so.6")
    trim_result = glibc.malloc_trim(0)
    return trim_result

client.run(trim_memory)

In [None]:
%%time

# Compute the three columns needed for the duplicate check into a local
# (non-lazy) frame.
stations = filtered_bbdd[['timestamp', 'station_id', 'ctx0']].compute()

In [None]:
%%time 

# Test on a single station (id 1): are there several rows for the same
# (timestamp, station_id) pair?

cond = stations.station_id.isin([1])
station_data = stations[cond]
station_group = station_data.groupby(['timestamp', 'station_id'])
# Number of non-null ctx0 values and mean ctx0 per (timestamp, station_id).
station_counts = station_group.ctx0.count()
station_mean = station_group.ctx0.mean()
# True when at least one pair has more than one ctx0 value, i.e. duplicates exist.
(station_counts > 1).any()

In [None]:
station_counts = station_counts.reset_index()
station_counts

In [None]:
station_counts.timestamp.to_list()

In [None]:
station_mean = station_mean.reset_index()
station_mean

In [None]:
station_mean[(station_counts.ctx0 > 1)]

In [None]:
%%time

# Same computation as the earlier `stations = ...` cell in this section.
# NOTE(review): presumably re-run to start the de-duplication below from a
# fresh frame — confirm whether the repeat is needed.
stations = filtered_bbdd[['timestamp', 'station_id', 'ctx0']].compute()

In [None]:
stations.columns

In [None]:
stations.shape

In [None]:
# Keep only the last row of every (timestamp, station_id) pair.
stations = stations.drop_duplicates(subset=['timestamp', 'station_id'], keep='last')

In [None]:
stations.shape

In [None]:
# Preview only: the frame holds millions of rows, so listing the whole index
# would flood the notebook output.
stations.index[:20].to_list()

In [None]:
get_ddf_shape(filtered_bbdd)

In [None]:
get_ddf_shape(nonfiltered_bbdd)

# Datos adicionales
Tiempo, festivos Barcelona