# CityBikes

***Data Source:  [CityBikes](https://citybik.es/) API***

In [2]:
#libraries
import pandas as pd
import requests 
import numpy as np


***Importing and Exploring the data***

In [112]:
# Query the CityBikes API for the list of networks. The network list is the top level of
# the data and is the entry point to everything else (stations, bikes).
def fetch_city_bikes(timeout=30):
    """Request the list of bike-share networks from the CityBikes API.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely on a stalled connection.

    Returns:
        The `requests.Response` when the request succeeds, or None when it
        fails (the error is printed instead of raised).
    """
    url = "http://api.citybik.es/v2/networks"
    headers = {"accept": "application/json"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx status codes into HTTPError
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error")
        print(errh.args[0])
    except requests.exceptions.RequestException as err:
        # Connection failures and timeouts are not HTTPError; report them the
        # same way instead of letting a raw traceback escape.
        print("Request Error")
        print(err)
    else:
        return response
    return None  # explicit: callers must check for a failed request
      

In [113]:
res = fetch_city_bikes()
# fetch_city_bikes() returns None when the request fails; stop here with a clear
# message instead of an AttributeError on `None.json()`.
if res is None:
    raise RuntimeError("CityBikes networks request failed; see the error printed above")
data = res.json()  # Convert the response to json


In [114]:
# Normalize the semi-structured JSON into a flat table (nested keys become dotted column names).
networks=pd.json_normalize(data['networks']) # 'networks' is the top-level key of the JSON holding the data we are interested in

In [115]:
networks.head()

Unnamed: 0,company,href,id,name,location.city,location.country,location.latitude,location.longitude,source,gbfs_href,license.name,license.url,ebikes
0,[ЗАО «СитиБайк»],/v2/networks/velobike-moscow,velobike-moscow,Velobike,Moscow,RU,55.75,37.616667,,,,,
1,[Urban Infrastructure Partner],/v2/networks/baerum-bysykkel,baerum-bysykkel,Bysykkel,Bærum,NO,59.89455,10.546343,,,,,
2,[Comunicare S.r.l.],/v2/networks/bicincitta-siena,bicincitta-siena,Bicincittà,Siena,IT,43.3186,11.3306,https://www.bicincitta.com/frmLeStazioni.aspx?...,,,,
3,[Cyclopolis Systems],/v2/networks/cyclopolis-maroussi,cyclopolis-maroussi,Cyclopolis,Maroussi,GR,38.056872,23.80833,,,,,
4,[Cyclopolis Systems],/v2/networks/cyclopolis-nafplio,cyclopolis-nafplio,Cyclopolis,Nafplio,GR,37.56394,22.80934,,,,,


In [116]:
# Save the raw networks data to a CSV file so that we do not need to
# re-query the API if we want to rebuild the dataframe later.
networks.to_csv('../data/raw_networks.csv', index=False)

In [148]:
# we can get our dataframe from the csv file if we need at any time 
#networks=pd.read_csv('../data/raw_networks.csv')

***Let's explore the networks dataframe***

In [149]:
networks.shape # the output shows dataframe have 590 rows and 13 columns

(590, 13)

In [150]:
networks.info() #summary of dataframe

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   company             579 non-null    object 
 1   href                590 non-null    object 
 2   id                  590 non-null    object 
 3   name                590 non-null    object 
 4   location.city       590 non-null    object 
 5   location.country    590 non-null    object 
 6   location.latitude   590 non-null    float64
 7   location.longitude  590 non-null    float64
 8   source              158 non-null    object 
 9   gbfs_href           87 non-null     object 
 10  license.name        25 non-null     object 
 11  license.url         25 non-null     object 
 12  ebikes              25 non-null     object 
dtypes: float64(2), object(11)
memory usage: 60.0+ KB


In [151]:
networks.isnull().sum() # brief summary of the total number of null values in each column of the dataframe

company                11
href                    0
id                      0
name                    0
location.city           0
location.country        0
location.latitude       0
location.longitude      0
source                432
gbfs_href             503
license.name          565
license.url           565
ebikes                565
dtype: int64

In [152]:
networks_df=networks.copy() # we can copy a DataFrame without affecting the original(deep copy)

***Note:*** The calls above give basic information about the `networks` dataframe: the number of rows and columns, the column names, each column's datatype, and the null-value count of each column.

In [13]:
#Lets do some data cleaning

In [153]:
networks_df.columns # columns list

Index(['company', 'href', 'id', 'name', 'location.city', 'location.country',
       'location.latitude', 'location.longitude', 'source', 'gbfs_href',
       'license.name', 'license.url', 'ebikes'],
      dtype='object')

In [154]:
# Remove the columns that are outside the scope of our points of interest.
# networks_df is built from the untouched `networks` frame rather than dropped in place,
# so this cell can be re-run without a KeyError for columns that are already gone.
networks_df = networks.drop(columns=['location.latitude', 'location.longitude', 'href', 'company',
                                     'source', 'gbfs_href', 'license.name', 'license.url'])


In [155]:
networks_df.columns # columns list

Index(['id', 'name', 'location.city', 'location.country', 'ebikes'], dtype='object')

In [124]:
# ebikes column cleaning

In [156]:
networks_df['ebikes'].value_counts(dropna=False)# count the distinct values, including NaN values

NaN     565
True     25
Name: ebikes, dtype: int64

***Note:*** Here we assume that missing values in the 'ebikes' column mean 'False'.

In [157]:
networks_df['ebikes'] = networks_df['ebikes'].fillna(False) #fill the NaN in ebikes column as opposite to True i.e. 'False'

In [158]:
networks_df['ebikes'].value_counts(dropna=False) # Now it looks more significant

False    565
True      25
Name: ebikes, dtype: int64

In [159]:
# astype returns a new Series; assign it back so the column's datatype actually changes
networks_df['ebikes'] = networks_df['ebikes'].astype('bool')
networks_df['ebikes']  # display the converted column

0      False
1      False
2      False
3      False
4      False
       ...  
585    False
586    False
587    False
588    False
589    False
Name: ebikes, Length: 590, dtype: bool

In [129]:
networks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  590 non-null    object 
 1   name                590 non-null    object 
 2   location.city       590 non-null    object 
 3   location.country    590 non-null    object 
 4   location.latitude   590 non-null    float64
 5   location.longitude  590 non-null    float64
 6   ebikes              590 non-null    bool   
dtypes: bool(1), float64(2), object(4)
memory usage: 28.4+ KB


In [130]:
# end of ebikes column cleaning

In [160]:
#Again check for null values
networks_df.isnull().sum()


id                  0
name                0
location.city       0
location.country    0
ebikes              0
dtype: int64

***Note:*** So now our dataframe does not contain any null values.

In [161]:
# Lets check for duplicate rows
networks_df.duplicated().value_counts()

False    590
dtype: int64

***Note:*** No duplicate records were found: duplicate rows would be counted under 'True', and only 'False' appears.

***Note:*** Rename the columns to give them more descriptive names.

In [162]:
# Map the flattened API column names onto clearer names (returns a renamed frame).
networks_df = networks_df.rename(
    columns={
        "id": "network_id",
        "name": "network_name",
        "location.city": "city",
        "location.country": "country",
    }
)

In [163]:
networks_df.head()

Unnamed: 0,network_id,network_name,city,country,ebikes
0,velobike-moscow,Velobike,Moscow,RU,False
1,baerum-bysykkel,Bysykkel,Bærum,NO,False
2,bicincitta-siena,Bicincittà,Siena,IT,False
3,cyclopolis-maroussi,Cyclopolis,Maroussi,GR,False
4,cyclopolis-nafplio,Cyclopolis,Nafplio,GR,False


***Note:***
- At this stage we consider our dataframe cleaned.
- Save the cleaned Dataframe into csv file

In [164]:
networks_df.to_csv('../data/networks.csv', index=False)

***Note:*** A quick check that the saved backup file can be read back.

In [165]:
df=pd.read_csv('../data/networks.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   network_id    590 non-null    object
 1   network_name  590 non-null    object
 2   city          590 non-null    object
 3   country       590 non-null    object
 4   ebikes        590 non-null    bool  
dtypes: bool(1), object(4)
memory usage: 19.1+ KB


***Note:*** It seems perfect

In [137]:
#networks_df=df.copy()

***Send a request to CityBikes for the city of your choice.***

In [138]:
def fetch_stations(network_id, stations, timeout=30):
    """Download the stations of one network and append them to `stations`.

    Args:
        network_id: CityBikes network id used in the request URL (e.g. "velib").
        stations: DataFrame of the stations collected so far (may be empty).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A new DataFrame holding the rows of `stations` followed by the stations
        of `network_id`, each tagged with a `network_id` column. If the request
        fails, the error is printed and `stations` is returned unchanged, so one
        failing network does not discard the data already collected.
    """
    url = f"http://api.citybik.es/v2/networks/{network_id}?fields=stations,id"
    headers = {"accept": "application/json"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error")
        print(errh.args[0])
        return stations
    except requests.exceptions.RequestException as err:
        # Connection failures and timeouts are not HTTPError
        print("Request Error")
        print(err)
        return stations

    network = response.json()['network']
    st = pd.json_normalize(network['stations'])  # parse the json at the stations node
    st['network_id'] = network['id']  # tag every station row with its network id
    print("Process Completed for network ", network_id)
    return pd.concat([stations, st], ignore_index=True)



In [139]:

def fetch_stations_of_city(city):
    """Collect the stations of every network whose city matches `city`.

    The match is a case-insensitive substring search on the `city` column of
    the notebook-level `networks_df`, so 'Paris' also picks up a value such as
    'FR Paris'.

    Args:
        city: City name to search for; surrounding whitespace is ignored.

    Returns:
        A DataFrame with the stations of all matching networks (empty when no
        network matches or every request fails).
    """
    # regex=False: the city name is matched literally, so characters such as '.' or '('
    #              in a name are not interpreted as regular-expression operators.
    # na=False:    rows with a missing city count as "no match" instead of producing
    #              NaN entries in the mask, which cannot be used for filtering.
    matches = networks_df['city'].str.contains(city.strip(), case=False, regex=False, na=False)
    city_networks = networks_df[matches]

    stations = pd.DataFrame()
    for network_id in city_networks['network_id']:
        fetched = fetch_stations(network_id, stations)
        # Keep what was collected so far if a fetch reports failure with None
        if fetched is not None:
            stations = fetched
    return stations
   

- The City of my choice is ***Paris***

In [166]:
city='Paris'
stations=fetch_stations_of_city(city)
stations.head()

Process Completed for network  velib
Process Completed for network  saclay-captainbike


Unnamed: 0,empty_slots,free_bikes,id,latitude,longitude,name,timestamp,extra.banking,extra.ebikes,extra.last_updated,extra.payment-terminal,extra.renting,extra.returning,extra.slots,extra.station_id,extra.uid,extra.payment,network_id,extra.address
0,34,1,78ec9186acd18a0b30bd3156d24b9f8d,48.865983,2.275725,Benjamin Godard - Victor Hugo,2023-09-27T05:19:41.321000Z,False,0.0,1695790722,False,1,1,35,213688169.0,16107,,velib,
1,51,2,43c856353b954711f2bbee185a1f9d04,48.853756,2.339096,André Mazet - Saint-André des Arts,2023-09-27T05:19:41.255000Z,True,0.0,1695790846,True,1,1,55,99950133.0,6015,[creditcard],velib,
2,32,5,9b2700f3b19beb1dd9df2fd6a013789f,48.867872,2.364898,Faubourg Du Temple - Republique,2023-09-27T05:19:38.914000Z,True,1.0,1695791070,True,1,1,38,312165511.0,11037,[creditcard],velib,
3,19,1,80778031e1a033712bfc3a27a190d5dd,48.856452,2.334852,Beaux-Arts - Bonaparte,2023-09-27T05:19:39.546000Z,True,1.0,1695790987,True,1,1,20,210405211.0,6021,[creditcard],velib,
4,21,0,f8f9ec30d28856f9b7fad103be9e4674,48.879296,2.33736,Toudouze - Clauzel,2023-09-27T05:19:39.655000Z,True,0.0,1695790999,True,1,1,21,36255.0,9020,[creditcard],velib,


***Note:*** save this data into csv file for backup

In [167]:
stations.to_csv('../data/raw_stations.csv', index=False)

***Parse through the response to get the details you want for the bike stations in that city (latitude, longitude, number of bikes).***

***Note:*** We will create a new DataFrame containing only the fields of interest, to keep things simple. If we later need any other column, we can load it from the raw_stations backup.

In [168]:
stations.columns #display the columns and fetch only the required ones

Index(['empty_slots', 'free_bikes', 'id', 'latitude', 'longitude', 'name',
       'timestamp', 'extra.banking', 'extra.ebikes', 'extra.last_updated',
       'extra.payment-terminal', 'extra.renting', 'extra.returning',
       'extra.slots', 'extra.station_id', 'extra.uid', 'extra.payment',
       'network_id', 'extra.address'],
      dtype='object')

In [174]:
stations_df=stations[['id','name','latitude', 'longitude', 'free_bikes','network_id']].copy()
stations_df.head()

Unnamed: 0,id,name,latitude,longitude,free_bikes,network_id
0,78ec9186acd18a0b30bd3156d24b9f8d,Benjamin Godard - Victor Hugo,48.865983,2.275725,1,velib
1,43c856353b954711f2bbee185a1f9d04,André Mazet - Saint-André des Arts,48.853756,2.339096,2,velib
2,9b2700f3b19beb1dd9df2fd6a013789f,Faubourg Du Temple - Republique,48.867872,2.364898,5,velib
3,80778031e1a033712bfc3a27a190d5dd,Beaux-Arts - Bonaparte,48.856452,2.334852,1,velib
4,f8f9ec30d28856f9b7fad103be9e4674,Toudouze - Clauzel,48.879296,2.33736,0,velib


***Put your parsed results into a DataFrame.***

In [144]:
# first lets do some exploration and cleaning

In [175]:
# Give the station columns clearer names (returns a renamed frame).
stations_df = stations_df.rename(
    columns={
        "id": "station_id",
        "name": "station",
        "free_bikes": "no_of_bikes",
    }
)

In [176]:
stations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   station_id   1459 non-null   object 
 1   station      1459 non-null   object 
 2   latitude     1459 non-null   float64
 3   longitude    1459 non-null   float64
 4   no_of_bikes  1459 non-null   int64  
 5   network_id   1459 non-null   object 
dtypes: float64(2), int64(1), object(3)
memory usage: 68.5+ KB


***Note:*** The output above shows that there are no null or missing values. For now we consider the data clean and save it to a CSV file for future use.

In [177]:
stations_df.to_csv('../data/paris_stations.csv', index=False) #data about only city named Paris