##  Bike Availability Prediction

1. last_updated : Timestamp of the file
2. ttl : Time to live (TTL) of the response
3. data : Station information array container
    * station_id : Station identifier
    * num_bikes_available : Number of bikes available
    * num_bikes_available_types: Array of available bike types
        * mechanical : Number of mechanical bicycles available
        * ebike: Number of electric bikes available
    * num_docks_available: Number of available docks
    * is_installed: The station is correctly installed (0-NO,1-YES)
    * is_renting: The station is successfully providing bikes
    * is_returning: The station is successfully docking bikes
    * last_reported: Timestamp of station information
    * is_charging_station: The station has electric bike charging capacity
    * status: Status of the station (IN_SERVICE=In service, CLOSED=Closed)

In [1]:
# Import libraries
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: compared as plain strings, "0.3" >= "0.20"
# would wrongly pass because '3' sorts after '2'.
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

import pandas as pd
import numpy as np
import urllib.request
import zipfile
import os
from tqdm.notebook import tqdm
import pyarrow.parquet as pq
import py7zr
import zipfile
from pathlib import Path
import datetime



ModuleNotFoundError: No module named 'py7zr'

In [86]:
# Ignore useless warnings (see SciPy issue #5998)
import warnings
from sklearn.exceptions import DataConversionWarning
# SettingWithCopyWarning has lived in different modules across pandas releases
# (pandas.errors in newer ones, pandas.core.common in older ones). Try both so
# this cell does not fail at import time; if neither defines it there is
# nothing to silence.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None
# warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
if SettingWithCopyWarning is not None:
    warnings.filterwarnings(action='ignore', category=SettingWithCopyWarning)

In [87]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from time import time

In [88]:
import shapefile
from shapely.geometry import Polygon
from descartes.patch import PolygonPatch
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Years to process; the download and loading cells iterate over them in this order.
YEARS = [2022, 2021, 2020, 2019]

In [4]:
import os

# (month number, Catalan month name) pairs. The name is part of the remote archive
# names below and of the CSV names built by the loaders, which index this list.
i2m = list(zip(range(1,13), ['Gener','Febrer','Marc','Abril','Maig','Juny','Juliol','Agost','Setembre','Octubre','Novembre','Desembre']))
BICING_BASE_URL = 'https://opendata-ajuntament.barcelona.cat/resources/bcn/BicingBCN'

for year in tqdm(YEARS):
    for month, month_name in tqdm(i2m):
        month_dir = f'data/{year}/{month:02d}'
        # exist_ok=True makes this a no-op when the folder is already there, and
        # makedirs also creates the parent data/{year} folder.
        os.makedirs(month_dir, exist_ok=True)

        # (remote archive kind, local file name, message printed when the download fails)
        archives = [
            ('INFORMACIO', f'{month:02d}_INFO.7z', 'Not available INFO'),
            ('ESTACIONS', f'{month:02d}.7z', 'Not available'),
        ]
        for kind, file_name, failure_message in archives:
            target = f'{month_dir}/{file_name}'
            if os.path.exists(target):
                continue  # already downloaded by a previous run
            url = f'{BICING_BASE_URL}/{year}_{month:02d}_{month_name}_BicingNou_{kind}.7z'
            try:
                urllib.request.urlretrieve(url, target)
            except Exception:
                # Best effort: a missing month must not stop the loop. Catching
                # Exception (not a bare except) still lets Ctrl-C interrupt the run.
                # Remove a partially written file so the next run retries the
                # download instead of keeping a truncated archive.
                if os.path.exists(target):
                    os.remove(target)
                print(month_name, year, failure_message)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Gener 2019 Not available INFO
Gener 2019 Not available
Febrer 2019 Not available INFO
Febrer 2019 Not available


In [5]:
# Columns kept from the station-information CSVs; load_information drops every other column.
columns_info = ['station_id', 'name', 'physical_configuration', 'lat', 'lon', 'altitude', 'address', 'post_code', 'capacity', 'is_charging_station','nearby_distance', 'cross_street', 'last_updated','ttl']

In [6]:
def load_information(year, month):
    """
    Load the station-information CSV of one month into a DataFrame.

    The CSV is extracted from ``data/{year}/{month:02d}/{month:02d}_INFO.7z``
    when it is not on disk yet. Only the columns listed in ``columns_info``
    are kept.

    Parameters
    ----------
    year : int
        Year of the file.
    month : int
        Month number (1-12); used to index ``i2m`` for the month name.

    Returns
    -------
    pandas.DataFrame or list
        The filtered table, or an empty list when the archive cannot be
        extracted or the CSV cannot be read.
    """
    month_name = i2m[month-1][1]
    folder = f'data/{year}/{month:02d}'

    # The 2019 files use other naming schemes for the first half of the year.
    if year == 2019 and month == 6:
        suffix = 'NouBicing_INFORMACIO'
    elif year == 2019 and month < 6:
        suffix = 'BICING2_INFO'
    else:
        suffix = 'BicingNou_INFORMACIO'
    path = f'{folder}/{year}_{month:02d}_{month_name}_{suffix}.csv'

    if not os.path.exists(path):
        try:
            # The context manager closes the archive even when extraction fails.
            with py7zr.SevenZipFile(f'{folder}/{month:02d}_INFO.7z', 'r') as archive:
                archive.extractall(path=f'{folder}/')
        except Exception as exc:
            # Best effort, but report the real cause so that problems such as a
            # missing py7zr module are not mistaken for a missing file.
            print('No such file or directory: ', f'{year}_{month:02d}_{month_name}_BicingNou_INFORMACIO.csv', f'({type(exc).__name__}: {exc})')
            return []

    if month == 3:
        # The March file is named with "Març" inside the archive; rename it to
        # the ASCII "Marc" name used by `path`.
        accented = f'{folder}/{year}_{month:02d}_Març_{suffix}.csv'
        try:
            os.rename(accented, path)
        except OSError:
            print(f'The name of the file is with {month_name}')

    try:
        if year == 2022 and month == 1:  # Gener 2022
            # Only the first 13 columns of this file are read.
            df = pd.read_csv(path, encoding='utf8', encoding_errors='replace', usecols=list(range(13)))
        else:
            df = pd.read_csv(path, encoding='utf8', encoding_errors='replace')
        return df.loc[:, df.columns.isin(columns_info)]
    except Exception as exc:
        print(f'{month:02d}', year, 'information Not available', f'({type(exc).__name__}: {exc})')
        # Same "nothing loaded" value as the extraction failure above, so callers
        # can test len(...) on every path instead of receiving None.
        return []
        



In [7]:
def load_table(year, month):
    """
    Load the station-status CSV of one month into a DataFrame.

    The CSV is extracted from ``data/{year}/{month:02d}/{month:02d}.7z`` when
    it is not on disk yet.

    Parameters
    ----------
    year : int
        Year of the file.
    month : int
        Month number (1-12); used to index ``i2m`` for the month name.

    Returns
    -------
    pandas.DataFrame or list
        The table, or an empty list when the archive cannot be extracted or
        the CSV cannot be read.
    """
    month_name = i2m[month-1][1]
    folder = f'data/{year}/{month:02d}'

    # The 2019 files use other naming schemes for the first half of the year.
    if year == 2019 and month == 6:
        suffix = 'NouBicing_ESTACIONS'
    elif year == 2019 and month < 6:
        suffix = 'BICING2_STAT'
    else:
        suffix = 'BicingNou_ESTACIONS'
    path = f'{folder}/{year}_{month:02d}_{month_name}_{suffix}.csv'

    if not os.path.exists(path):
        try:
            # The context manager closes the archive even when extraction fails.
            with py7zr.SevenZipFile(f'{folder}/{month:02d}.7z', 'r') as archive:
                archive.extractall(path=f'{folder}/')
        except Exception as exc:
            # Best effort, but report the real cause so that problems such as a
            # missing py7zr module or a missing archive are visible.
            print(f'The name of the file is {year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.csv', f'({type(exc).__name__}: {exc})')
            return []

    if month == 3:
        # The March file is named with "Març" inside the archive; rename it to
        # the ASCII "Marc" name used by `path`.
        accented = f'{folder}/{year}_{month:02d}_Març_{suffix}.csv'
        try:
            os.rename(accented, path)
        except OSError:
            print(f'The name of the file is with {month_name}')

    try:
        return pd.read_csv(path)
    except Exception as exc:
        print(f'{month:02d}', year, 'dataset Not available', f'({type(exc).__name__}: {exc})')
        # Same "nothing loaded" value as the extraction failure above, so callers
        # can test len(...) on every path instead of receiving None.
        return []



In [8]:
def clean_data(data, year, month, df_name , sampling = 10000):
    """
    Downsample one month of data and derive date columns from its timestamps.

    Parameters
    ----------
    data : pandas.DataFrame, list or None
        Table to clean. ``None`` or anything of length 0 means nothing was loaded.
    year, month : int
        Not used by the function; kept so existing calls keep working.
    df_name : str
        "info" for station-information tables. For any other value the
        ``last_reported`` column is converted as well.
    sampling : int, default 10000
        Keep one row out of every ``sampling`` rows.

    Returns
    -------
    pandas.DataFrame or None
        A new frame with ``last_updated`` (and ``last_reported``) formatted as
        'YYYY-MM-DD HH:MM:SS' strings plus ``day``, ``month`` and ``year``
        columns, or ``None`` when there is nothing to clean.
    """
    if data is None or len(data) == 0:
        return None

    # Copy the slice so the new columns are written to an independent frame
    # rather than to a slice of the caller's DataFrame (which triggers
    # SettingWithCopyWarning).
    data = data[::sampling].copy()

    time_format = '%Y-%m-%d %H:%M:%S'
    # Convert each timestamp once; fromtimestamp() uses the machine's local time zone.
    updated = [datetime.datetime.fromtimestamp(t) for t in data.last_updated]
    data["last_updated"] = [moment.strftime(time_format) for moment in updated]
    data["day"] = [moment.day for moment in updated]
    data["month"] = [moment.month for moment in updated]
    data["year"] = [moment.year for moment in updated]
    if df_name != "info":
        data["last_reported"] = [datetime.datetime.fromtimestamp(t).strftime(time_format) for t in data.last_reported]

    return data

In [9]:
# Load and clean the station-information table of every month of every year,
# then stack the monthly tables into a single DataFrame.
data_info = pd.concat(
    [
        clean_data(load_information(year, month), year, month, "info")
        for year in tqdm(YEARS)
        for month in tqdm(range(1, 13), leave=False)
    ]
)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

No such file or directory:  2022_01_Gener_BicingNou_INFORMACIO.csv
No such file or directory:  2022_02_Febrer_BicingNou_INFORMACIO.csv
No such file or directory:  2022_03_Marc_BicingNou_INFORMACIO.csv
No such file or directory:  2022_04_Abril_BicingNou_INFORMACIO.csv
No such file or directory:  2022_05_Maig_BicingNou_INFORMACIO.csv
No such file or directory:  2022_06_Juny_BicingNou_INFORMACIO.csv
No such file or directory:  2022_07_Juliol_BicingNou_INFORMACIO.csv
No such file or directory:  2022_08_Agost_BicingNou_INFORMACIO.csv
No such file or directory:  2022_09_Setembre_BicingNou_INFORMACIO.csv
No such file or directory:  2022_10_Octubre_BicingNou_INFORMACIO.csv
No such file or directory:  2022_11_Novembre_BicingNou_INFORMACIO.csv
No such file or directory:  2022_12_Desembre_BicingNou_INFORMACIO.csv


  0%|          | 0/12 [00:00<?, ?it/s]

No such file or directory:  2021_01_Gener_BicingNou_INFORMACIO.csv
No such file or directory:  2021_02_Febrer_BicingNou_INFORMACIO.csv
No such file or directory:  2021_03_Marc_BicingNou_INFORMACIO.csv
No such file or directory:  2021_04_Abril_BicingNou_INFORMACIO.csv
No such file or directory:  2021_05_Maig_BicingNou_INFORMACIO.csv
No such file or directory:  2021_06_Juny_BicingNou_INFORMACIO.csv
No such file or directory:  2021_07_Juliol_BicingNou_INFORMACIO.csv
No such file or directory:  2021_08_Agost_BicingNou_INFORMACIO.csv
No such file or directory:  2021_09_Setembre_BicingNou_INFORMACIO.csv
No such file or directory:  2021_10_Octubre_BicingNou_INFORMACIO.csv
No such file or directory:  2021_11_Novembre_BicingNou_INFORMACIO.csv
No such file or directory:  2021_12_Desembre_BicingNou_INFORMACIO.csv


  0%|          | 0/12 [00:00<?, ?it/s]

No such file or directory:  2020_01_Gener_BicingNou_INFORMACIO.csv
No such file or directory:  2020_02_Febrer_BicingNou_INFORMACIO.csv
No such file or directory:  2020_03_Marc_BicingNou_INFORMACIO.csv
No such file or directory:  2020_04_Abril_BicingNou_INFORMACIO.csv
No such file or directory:  2020_05_Maig_BicingNou_INFORMACIO.csv
No such file or directory:  2020_06_Juny_BicingNou_INFORMACIO.csv
No such file or directory:  2020_07_Juliol_BicingNou_INFORMACIO.csv
No such file or directory:  2020_08_Agost_BicingNou_INFORMACIO.csv
No such file or directory:  2020_09_Setembre_BicingNou_INFORMACIO.csv
No such file or directory:  2020_10_Octubre_BicingNou_INFORMACIO.csv
No such file or directory:  2020_11_Novembre_BicingNou_INFORMACIO.csv
No such file or directory:  2020_12_Desembre_BicingNou_INFORMACIO.csv


  0%|          | 0/12 [00:00<?, ?it/s]

No such file or directory:  2019_01_Gener_BicingNou_INFORMACIO.csv
No such file or directory:  2019_02_Febrer_BicingNou_INFORMACIO.csv
No such file or directory:  2019_03_Marc_BicingNou_INFORMACIO.csv
No such file or directory:  2019_04_Abril_BicingNou_INFORMACIO.csv
No such file or directory:  2019_05_Maig_BicingNou_INFORMACIO.csv
No such file or directory:  2019_06_Juny_BicingNou_INFORMACIO.csv
No such file or directory:  2019_07_Juliol_BicingNou_INFORMACIO.csv
No such file or directory:  2019_08_Agost_BicingNou_INFORMACIO.csv
No such file or directory:  2019_09_Setembre_BicingNou_INFORMACIO.csv
No such file or directory:  2019_10_Octubre_BicingNou_INFORMACIO.csv
No such file or directory:  2019_11_Novembre_BicingNou_INFORMACIO.csv
No such file or directory:  2019_12_Desembre_BicingNou_INFORMACIO.csv


ValueError: All objects passed were None

In [10]:
# Load and clean the station-status table of every month of every year,
# then stack the monthly tables into a single DataFrame.
data = pd.concat(
    [
        clean_data(load_table(year, month), year, month, "data")
        for year in tqdm(YEARS)
        for month in tqdm(range(1, 13), leave=False)
    ]
)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

The name of the file is 2022_01_Gener_BicingNou_ESTACIONS.csv
The name of the file is 2022_02_Febrer_BicingNou_ESTACIONS.csv
The name of the file is 2022_03_Marc_BicingNou_ESTACIONS.csv
The name of the file is 2022_04_Abril_BicingNou_ESTACIONS.csv
The name of the file is 2022_05_Maig_BicingNou_ESTACIONS.csv
The name of the file is 2022_06_Juny_BicingNou_ESTACIONS.csv
The name of the file is 2022_07_Juliol_BicingNou_ESTACIONS.csv
The name of the file is 2022_08_Agost_BicingNou_ESTACIONS.csv
The name of the file is 2022_09_Setembre_BicingNou_ESTACIONS.csv
The name of the file is 2022_10_Octubre_BicingNou_ESTACIONS.csv
The name of the file is 2022_11_Novembre_BicingNou_ESTACIONS.csv
The name of the file is 2022_12_Desembre_BicingNou_ESTACIONS.csv


  0%|          | 0/12 [00:00<?, ?it/s]

The name of the file is 2021_01_Gener_BicingNou_ESTACIONS.csv
The name of the file is 2021_02_Febrer_BicingNou_ESTACIONS.csv
The name of the file is 2021_03_Marc_BicingNou_ESTACIONS.csv
The name of the file is 2021_04_Abril_BicingNou_ESTACIONS.csv
The name of the file is 2021_05_Maig_BicingNou_ESTACIONS.csv
The name of the file is 2021_06_Juny_BicingNou_ESTACIONS.csv
The name of the file is 2021_07_Juliol_BicingNou_ESTACIONS.csv
The name of the file is 2021_08_Agost_BicingNou_ESTACIONS.csv
The name of the file is 2021_09_Setembre_BicingNou_ESTACIONS.csv
The name of the file is 2021_10_Octubre_BicingNou_ESTACIONS.csv
The name of the file is 2021_11_Novembre_BicingNou_ESTACIONS.csv
The name of the file is 2021_12_Desembre_BicingNou_ESTACIONS.csv


  0%|          | 0/12 [00:00<?, ?it/s]

The name of the file is 2020_01_Gener_BicingNou_ESTACIONS.csv
The name of the file is 2020_02_Febrer_BicingNou_ESTACIONS.csv
The name of the file is 2020_03_Marc_BicingNou_ESTACIONS.csv
The name of the file is 2020_04_Abril_BicingNou_ESTACIONS.csv
The name of the file is 2020_05_Maig_BicingNou_ESTACIONS.csv
The name of the file is 2020_06_Juny_BicingNou_ESTACIONS.csv
The name of the file is 2020_07_Juliol_BicingNou_ESTACIONS.csv
The name of the file is 2020_08_Agost_BicingNou_ESTACIONS.csv
The name of the file is 2020_09_Setembre_BicingNou_ESTACIONS.csv
The name of the file is 2020_10_Octubre_BicingNou_ESTACIONS.csv
The name of the file is 2020_11_Novembre_BicingNou_ESTACIONS.csv
The name of the file is 2020_12_Desembre_BicingNou_ESTACIONS.csv


  0%|          | 0/12 [00:00<?, ?it/s]

The name of the file is 2019_01_Gener_BicingNou_ESTACIONS.csv
The name of the file is 2019_02_Febrer_BicingNou_ESTACIONS.csv
The name of the file is 2019_03_Marc_BicingNou_ESTACIONS.csv
The name of the file is 2019_04_Abril_BicingNou_ESTACIONS.csv
The name of the file is 2019_05_Maig_BicingNou_ESTACIONS.csv
The name of the file is 2019_06_Juny_BicingNou_ESTACIONS.csv
The name of the file is 2019_07_Juliol_BicingNou_ESTACIONS.csv
The name of the file is 2019_08_Agost_BicingNou_ESTACIONS.csv
The name of the file is 2019_09_Setembre_BicingNou_ESTACIONS.csv
The name of the file is 2019_10_Octubre_BicingNou_ESTACIONS.csv
The name of the file is 2019_11_Novembre_BicingNou_ESTACIONS.csv
The name of the file is 2019_12_Desembre_BicingNou_ESTACIONS.csv


ValueError: All objects passed were None

In [11]:
# Number of missing values in each column of the station-status data.
data.isna().sum()

NameError: name 'data' is not defined

In [12]:
# Number of missing values in each column of the station-information data.
data_info.isna().sum()

NameError: name 'data_info' is not defined

In [13]:
# Preview the first rows of the station-status data.
# Alternative check (row counts per year and month): data.groupby(["year","month"]).count()
data.head()

NameError: name 'data' is not defined

In [104]:
# Preview the first rows of the station-information data.
# Alternative check (row counts per year and month): data_info.groupby(["year","month"]).count()
data_info.head()

Unnamed: 0,station_id,name,physical_configuration,lat,lon,altitude,address,post_code,capacity,nearby_distance,cross_street,last_updated,ttl,day,month,year,is_charging_station
0,1.0,"GRAN VIA CORTS CATALANES, 760",ELECTRICBIKESTATION,41.397978,2.180107,16.0,"GRAN VIA CORTS CATALANES, 760",8013.0,46.0,1000.0,,2021-12-31 23:59:55,18.0,31,12,2021,
10000,357.0,"C/ CARDENER, 59",ELECTRICBIKESTATION,41.410596,2.15799,87.0,"C/ CARDENER, 59",8024.0,27.0,1000.0,,2022-01-01 01:34:42,6.0,1,1,2022,
20000,197.0,"C/ GELABERT, 1",ELECTRICBIKESTATION,41.387158,2.141094,47.0,"C/ GELABERT, 1",8029.0,27.0,1000.0,,2022-01-01 03:15:10,30.0,1,1,2022,
30000,30.0,"AV. DIAGONAL, 231 AMB PADILLA",ELECTRICBIKESTATION,41.402181,2.182989,13.0,"AV. DIAGONAL, 231 AMB PADILLA",8013.0,29.0,1000.0,,2022-01-01 04:54:54,18.0,1,1,2022,
40000,385.0,"C/ CASANOVA, 119",ELECTRICBIKESTATION,41.387888,2.15529,30.0,"C/ CASANOVA, 119",8036.0,30.0,1000.0,,2022-01-01 06:29:37,1.0,1,1,2022,
