## Loading Data

In [6]:
import os
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import date, datetime

from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
i2m = list(zip(range(1,13), ['Gener', 'Febrer', 'Marc', 'Abril', 'Maig', 'Juny', 'Juliol', 'Agost', 'Setembre', 'Octubre', 'Novembre', 'Desembre']))
for year in [2023, 2022, 2021, 2020, 2019]:
    for month, month_name in i2m:        
        os.system(f"wget 'https://opendata-ajuntament.barcelona.cat/resources/bcn/BicingBCN/{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z'")
        os.system(f"7z x '{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z'")
        os.system(f"rm '{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z'")

In [15]:
df = pd.DataFrame()

In [16]:
def get_datetime(miliseconds: int):
    return datetime.fromtimestamp(miliseconds)

def create_date_df(df: pd.DataFrame):
    df['date'] = pd.to_datetime(
        df['last_reported'].apply(lambda x: get_datetime(x))
    )
    
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    
    return df

In [17]:
for csv in tqdm(os.listdir('./data')):
    new_df = pd.read_csv('./data/' + csv)
    
    # Unique values
    new_df = new_df.drop_duplicates()
    new_df = new_df.dropna(subset=['last_reported', 'last_updated'], axis=0)
    new_df = new_df.sort_values('last_reported', ascending=True)
    
    # Convert some categorical into numerical
    new_df.status = np.where(new_df.status == 'IN_SERVICE', 1, 0)
    new_df.is_charging_station = np.where(new_df.is_charging_station, 1, 0)
    
    # Create the dates from timestamp and group statistics
    new_df = create_date_df(new_df)
    new_df = new_df\
        .groupby(['station_id', 'year', 'month', 'day', 'hour'])\
        .mean(numeric_only=True)
    
    df = pd.concat(
        [df, new_df], 
        axis=0
    )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [09:10<00:00, 11.00s/it]


In [18]:
# Handle NaN: Assume that the NaN in traffic is 0
df.traffic = df.traffic.replace(np.nan, 0)

In [19]:
# Reset which is the index
df = df.reset_index()

In [20]:
df.shape

(16401766, 18)

In [21]:
df.head()

Unnamed: 0,station_id,year,month,day,hour,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,is_renting,is_returning,last_reported,is_charging_station,status,last_updated,ttl,traffic
0,1,2020,5,31,23,9.0,9.0,0.0,35.0,1.0,1.0,1.0,1590962000.0,1.0,1.0,1590962000.0,2.0,0.0
1,1,2020,6,1,0,7.727273,7.727273,0.0,36.272727,1.0,1.0,1.0,1590964000.0,1.0,1.0,1590964000.0,17.363636,0.0
2,1,2020,6,1,1,8.076923,8.076923,0.0,35.923077,1.0,1.0,1.0,1590968000.0,1.0,1.0,1590968000.0,15.461538,0.0
3,1,2020,6,1,2,7.75,7.75,0.0,36.25,1.0,1.0,1.0,1590971000.0,1.0,1.0,1590972000.0,14.083333,0.0
4,1,2020,6,1,3,8.0,8.0,0.0,36.0,1.0,1.0,1.0,1590975000.0,1.0,1.0,1590975000.0,14.25,0.0


### Add station information

In [22]:
def get_station_json(
    url: str = 'https://opendata-ajuntament.barcelona.cat/data/dataset/bd2462df-6e1e-4e37-8205-a4b8e7313b84/resource/e5adca8d-98bf-42c3-9b9c-364ef0a80494/download'
):
    
    res = requests.get(url)
    return res.json()

In [23]:
station_json = get_station_json()

In [24]:
station_df = pd.DataFrame(station_json['data']['stations'])
station_df.head()

Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,post_code,capacity,is_charging_station,nearby_distance,_ride_code_support,rental_uris,cross_street
0,1,"GRAN VIA CORTS CATALANES, 760",ELECTRICBIKESTATION,41.397978,2.180107,16.0,"GRAN VIA CORTS CATALANES, 760",8013,46,True,1000.0,True,,
1,2,"C/ ROGER DE FLOR, 126",ELECTRICBIKESTATION,41.395488,2.177198,17.0,"C/ ROGER DE FLOR, 126",8013,29,True,1000.0,True,,
2,3,"C/ NÀPOLS, 82",ELECTRICBIKESTATION,41.394156,2.181331,11.0,"C/ NÀPOLS, 82",8013,27,True,1000.0,True,,
3,4,"C/ RIBES, 13",ELECTRICBIKESTATION,41.393317,2.181248,8.0,"C/ RIBES, 13",8013,21,True,1000.0,True,,
4,5,"PG. LLUIS COMPANYS, 11 (ARC TRIOMF)",ELECTRICBIKESTATION,41.391103,2.180176,7.0,"PG. LLUIS COMPANYS, 11 (ARC TRIOMF)",8018,39,True,1000.0,True,,


In [25]:
# Some fields are not going to be used
station_drop_fields = [
    'physical_configuration',          # unique value: ELECTRICBIKESTATION
    '_ride_code_support',              # unique value: all True
    'nearby_distance',                 # unique value: 1000
    'name', 'address', 'post_code',    # too specific to each of the stations
    'is_charging_station',             # already in the main df
    'rental_uris', 'cross_street',     # vast majority are none
    
]

In [26]:
station_df = station_df.drop(station_drop_fields, axis=1)
station_df.head()

Unnamed: 0,station_id,lat,lon,altitude,capacity
0,1,41.397978,2.180107,16.0,46
1,2,41.395488,2.177198,17.0,29
2,3,41.394156,2.181331,11.0,27
3,4,41.393317,2.181248,8.0,21
4,5,41.391103,2.180176,7.0,39


In [27]:
df = pd.merge(df, station_df)

In [28]:
df.head()

Unnamed: 0,station_id,year,month,day,hour,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,...,last_reported,is_charging_station,status,last_updated,ttl,traffic,lat,lon,altitude,capacity
0,1,2020,5,31,23,9.0,9.0,0.0,35.0,1.0,...,1590962000.0,1.0,1.0,1590962000.0,2.0,0.0,41.397978,2.180107,16.0,46
1,1,2020,6,1,0,7.727273,7.727273,0.0,36.272727,1.0,...,1590964000.0,1.0,1.0,1590964000.0,17.363636,0.0,41.397978,2.180107,16.0,46
2,1,2020,6,1,1,8.076923,8.076923,0.0,35.923077,1.0,...,1590968000.0,1.0,1.0,1590968000.0,15.461538,0.0,41.397978,2.180107,16.0,46
3,1,2020,6,1,2,7.75,7.75,0.0,36.25,1.0,...,1590971000.0,1.0,1.0,1590972000.0,14.083333,0.0,41.397978,2.180107,16.0,46
4,1,2020,6,1,3,8.0,8.0,0.0,36.0,1.0,...,1590975000.0,1.0,1.0,1590975000.0,14.25,0.0,41.397978,2.180107,16.0,46


### Add Weather information

### Add Covid Information

### Datetime Information

In [51]:
def create_date_time(row):
    return datetime(
        int(row['year']),
        int(row['month']),
        int(row['day']),
        int(row['hour'])
    )

In [49]:
df['date_time'] = df.apply(lambda row: create_date_time(row), axis=1)

Related to the **day** (weekend).

In [75]:
def get_day_info(date_time):
    return 'weekend' if date_time.weekday() in [5, 6] else 'weekday'

In [76]:
df['day_info'] = df.date_time.apply(lambda x: get_day_info(x))

Related to the **hour** (which time of day it was).

In [74]:
def get_hour_info(date_time):
    if date_time.hour in range(5):
        return 'late_night'

    elif date_time.hour in range(5, 9):
        return 'early_morning'

    elif date_time.hour in range(9, 13):
        return 'morning'
    
    elif date_time.hour in range(13, 17):
        return 'noon'
    
    elif date_time.hour in range(17, 21):
        return 'eve'
    
    return 'night'

In [77]:
df['hour_info'] = df.date_time.apply(lambda x: get_hour_info(x))

Related to the **month** (season).

In [79]:
def get_month_info(date):
    if date.month in (3, 4, 5):
        return 'spring'
    
    elif date.month in (6, 7, 8):
        return 'summer'
    
    elif date.month in (9, 10, 11):
        return 'autumn'
    
    return 'winter'

In [80]:
df['month_info'] = df.date_time.apply(lambda x: get_month_info(x))

In [81]:
df.head()

Unnamed: 0,station_id,year,month,day,hour,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,...,is_weekend,is_late_night,is_early_morning,is_morning,is_noon,is_eve,is_night,day_info,hour_info,month_info
0,1,2020,5,31,23,9.0,9.0,0.0,35.0,1.0,...,1,0,0,0,0,0,1,weekend,night,spring
1,1,2020,6,1,0,7.727273,7.727273,0.0,36.272727,1.0,...,0,1,0,0,0,0,0,weekday,late_night,summer
2,1,2020,6,1,1,8.076923,8.076923,0.0,35.923077,1.0,...,0,1,0,0,0,0,0,weekday,late_night,summer
3,1,2020,6,1,2,7.75,7.75,0.0,36.25,1.0,...,0,1,0,0,0,0,0,weekday,late_night,summer
4,1,2020,6,1,3,8.0,8.0,0.0,36.0,1.0,...,0,1,0,0,0,0,0,weekday,late_night,summer


### Train / Val / Test Split

In [None]:
df.head()

**TODO**: 
- Window function: ctx-4, ctx-3, ...

### Exploration