In [1]:
import requests

In [2]:
# Endpoint with the per-station facts that stay stable over time
url_stations = ('https://gbfs.citibikenyc.com/gbfs/en/'
                'station_information.json')

# Endpoint with the live status of every station (e.g., bikes available etc)
url_status = ('https://gbfs.citibikenyc.com/gbfs/en/'
              'station_status.json')

In [3]:
# We fetch for now just the time-invariant data.
# The timeout stops the cell from hanging indefinitely if the feed is unreachable,
# and raise_for_status() reports an HTTP error here instead of letting it show up
# later as a JSON decoding problem or a KeyError.
response = requests.get(url_stations, timeout=30)
response.raise_for_status()
results = response.json()

In [4]:
# The JSON returned by the Citibike API carries more than we need.
# We keep only the "stations" entry nested under the top-level "data" key;
# the cells below index it (data[0]) and loop over it one station at a time.
data = results["data"]["stations"]

In [5]:
# The insertion task below does not rely on dataframes (the G2 notebook shows the Pandas route).
# Loading the records into one here is only a convenient way to look at their structure.
import pandas as pd
df = pd.DataFrame(data)
df.head()

Unnamed: 0,legacy_id,region_id,name,has_kiosk,station_type,capacity,rental_url,external_id,eightd_has_key_dispenser,station_id,lat,eightd_station_services,lon,short_name,rental_methods,electric_bike_surcharge_waiver
0,72,71,W 52 St & 11 Ave,True,classic,55,http://app.citibikenyc.com/S6Lr/IBV092JufD?sta...,66db237e-0aca-11e7-82f6-3863bb44ef7c,False,72,40.767272,[],-73.993929,6926.01,"[KEY, CREDITCARD]",False
1,79,71,Franklin St & W Broadway,True,classic,33,http://app.citibikenyc.com/S6Lr/IBV092JufD?sta...,66db269c-0aca-11e7-82f6-3863bb44ef7c,False,79,40.719116,[],-74.006667,5430.08,"[KEY, CREDITCARD]",False
2,82,71,St James Pl & Pearl St,True,classic,27,http://app.citibikenyc.com/S6Lr/IBV092JufD?sta...,66db277a-0aca-11e7-82f6-3863bb44ef7c,False,82,40.711174,[],-74.000165,5167.06,"[KEY, CREDITCARD]",False
3,83,71,Atlantic Ave & Fort Greene Pl,True,classic,62,http://app.citibikenyc.com/S6Lr/IBV092JufD?sta...,66db281e-0aca-11e7-82f6-3863bb44ef7c,False,83,40.683826,[],-73.976323,4354.07,"[KEY, CREDITCARD]",False
4,116,71,W 17 St & 8 Ave,True,classic,50,http://app.citibikenyc.com/S6Lr/IBV092JufD?sta...,66db28b5-0aca-11e7-82f6-3863bb44ef7c,False,116,40.741776,[],-74.001497,6148.02,"[KEY, CREDITCARD]",False


In [12]:
!pip3 install mysql-connector-python
!apt install python3-mysqldb

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  default-mysql-server | virtual-mysql-server python-egenix-mxdatetime
  python3-mysqldb-dbg
The following NEW packages will be installed:
  python3-mysqldb
0 upgraded, 1 newly installed, 0 to remove and 16 not upgraded.
Need to get 46.0 kB of archives.
After this operation, 183 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 python3-mysqldb amd64 1.3.10-1build1 [46.0 kB]
Fetched 46.0 kB in 0s (660 kB/s)
Selecting previously unselected package python3-mysqldb.
(Reading database ... 145483 files and directories currently installed.)
Preparing to unpack .../python3-mysqldb_1.3.10-1build1_amd64.deb ...
Unpacking python3-mysqldb (1.3.10-1build1) ...
Setting up python3-mysqldb (1.3.10-1build1) ...


In [15]:
# Now, let's connect to our database, where we will store our data.

import os
from getpass import getpass
from urllib.parse import quote

from sqlalchemy import create_engine

# Note, the user can use their own userid and database
# i.e DealinF19GBx   where x is your team number...
#
# Connection settings are read from environment variables so that the password
# is not stored in the notebook. Host and user fall back to the course defaults;
# the password is prompted for when MYSQL_PASSWORD is not set.
db_host = os.environ.get('MYSQL_HOST', 'bigdata.stern.nyu.edu')
db_user = os.environ.get('MYSQL_USER', 'DealingS21')
db_password = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ')

# Percent-encode user and password: characters such as '@', ':' or '/' would
# otherwise be read as delimiters of the connection URL.
conn_string = 'mysql://{user}:{password}@{host}/'.format(
    host=db_host,
    user=quote(db_user, safe=''),
    password=quote(db_password, safe=''))

# engine is used for every query below; con is the explicit connection that the
# last cell of the notebook closes.
engine = create_engine(conn_string)
con = engine.connect()

OperationalError: ignored

In [None]:
# Name of the database that every query below creates, fills and reads
db_name = "citibike"

In [None]:
# Just bookkeeping. Drop the database if it is already there, so the notebook
# starts from a clean slate (this deletes whatever the database held before).
# The statement lives in its own variable: it is a DROP, not a CREATE.
drop_db_query = "DROP DATABASE IF EXISTS {db}".format(db=db_name)
engine.execute(drop_db_query)

In [None]:
# Build the statement that creates the database which will hold the data
# (IF NOT EXISTS makes re-running this cell harmless) ...
create_db_query = f"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8'"

# ... and send it to the server
engine.execute(create_db_query)

In [None]:
# Create the first of the two tables: the one storing the time-invariant station data.
# (The table for the time-varying station status is created further down.)
# lat and lon are declared DOUBLE: a MySQL FLOAT is single precision and keeps
# only about 7 significant digits, which would round a coordinate such as
# -73.993929 and shift the station on the map.
table_name = 'Stations'
create_table_query = '''CREATE TABLE IF NOT EXISTS {db}.{table}
                                (station_id int,
                                name varchar(250),
                                capacity int,
                                lat double,
                                lon double,
                                region_id int,
                                short_name varchar(250),
                                rental_url varchar(250),
                                eightd_has_key_dispenser bool,
                                PRIMARY KEY(station_id)
                                )'''.format(db=db_name, table=table_name)
engine.execute(create_table_query)

In [None]:
# Fetch the station information again so that `data` holds the time-invariant
# records right before they are inserted.
# The timeout keeps the cell from hanging if the feed is unreachable, and
# raise_for_status() reports an HTTP error here rather than as a KeyError below.
response = requests.get(url_stations, timeout=30)
response.raise_for_status()
results = response.json()
data = results["data"]["stations"]
# Show the first record to see which fields are available
data[0]

In [None]:
# Store the time-invariant station records, one INSERT per station.
# INSERT IGNORE means that adding an entry that already exists does not raise
# an error; the server only emits a warning, which is expected.

table_name = 'Stations'
query_template = f'''INSERT IGNORE INTO {db_name}.{table_name}(station_id, 
                                        name,
                                        capacity, 
                                        lat, 
                                        lon,
                                        region_id,
                                        short_name,
                                        rental_url,
                                        eightd_has_key_dispenser) 
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)'''

for station in data:
    station_id = int(station['station_id'])
    # Values are listed in the same order as the columns in the template
    query_parameters = (
        station_id,
        station['name'],
        station['capacity'],
        station['lat'],
        station['lon'],
        station.get('region_id'),  # .get(): a missing region_id becomes None instead of raising
        station['short_name'],
        station['rental_url'],
        station['eightd_has_key_dispenser'],
    )

    print("Inserting station", station_id, "at", station['name'])
    engine.execute(query_template, query_parameters)


In [None]:
# Read the table back to verify the inserts. The database name comes from
# db_name, so this check keeps working if the database is renamed above.
check = pd.read_sql("SELECT * FROM {db}.Stations".format(db=db_name), con=engine)
check

### Plot the location of the stations, using lon and lat and circles of size 10

In [None]:
%matplotlib inline
check.plot(kind='scatter', x='lon', y='lat', s=10, figsize=(10,10))

In [None]:
# Second table: the time-varying status readings.
# The primary key is (station_id, last_reported), i.e. one row per station and report time.
table_name = 'Status'
create_table_query = f'''CREATE TABLE IF NOT EXISTS {db_name}.{table_name} 
                                (station_id int, 
                                last_reported datetime,
                                num_bikes_available int,
                                num_bikes_disabled int,
                                num_docks_available int,
                                num_docks_disabled int,
                                is_installed bool,
                                is_renting bool,
                                is_returning bool,
                                eightd_has_available_keys bool,
                                PRIMARY KEY(station_id, last_reported)
                                )'''
engine.execute(create_table_query)

In [None]:
# Fetch the live status feed. From here on `data` holds status records,
# no longer the station information.
# The timeout keeps the cell from hanging if the feed is unreachable, and
# raise_for_status() reports an HTTP error here rather than as a KeyError below.
response = requests.get(url_status, timeout=30)
response.raise_for_status()
results = response.json()
data = results["data"]["stations"]
# Show the first record to see which fields are available
data[0]

In [None]:
# Now we store the data about the time varying elements of the citibike stations.
# INSERT IGNORE skips a reading whose (station_id, last_reported) key is already
# stored, so re-running the cell does not raise duplicate-key errors.
from datetime import datetime, timezone

table_name = 'Status'
query_template = '''INSERT IGNORE INTO {db}.{table}(station_id,
                                            num_bikes_available,
                                            num_bikes_disabled,
                                            num_docks_available,
                                            num_docks_disabled,
                                            is_installed,
                                            is_renting,
                                            is_returning,
                                            last_reported,
                                            eightd_has_available_keys)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'''.format(db=db_name, table=table_name)

for entry in data:
    station_id = int(entry['station_id'])
    num_bikes_available = entry['num_bikes_available']
    num_bikes_disabled = entry['num_bikes_disabled']
    num_docks_available = entry['num_docks_available']
    num_docks_disabled = entry['num_docks_disabled']
    is_installed = entry['is_installed']
    is_renting = entry['is_renting']
    is_returning = entry['is_returning']
    # The feed value is passed to fromtimestamp() as an epoch timestamp. Converting
    # with an explicit UTC zone makes the stored value independent of the time zone
    # of the machine running the notebook; tzinfo is then dropped so the driver
    # receives a plain (naive) datetime for the DATETIME column.
    last_reported = datetime.fromtimestamp(
        entry['last_reported'], tz=timezone.utc).replace(tzinfo=None)
    eightd_has_available_keys = entry['eightd_has_available_keys']

    print("Inserting station", station_id)
    # Values are listed in the same order as the columns in the template
    query_parameters = (station_id, num_bikes_available, num_bikes_disabled,
                        num_docks_available, num_docks_disabled, is_installed, is_renting,
                        is_returning, last_reported, eightd_has_available_keys)
    engine.execute(query_template, query_parameters)



In [None]:
# Read the table back to verify the inserts. The database name comes from
# db_name, so this check keeps working if the database is renamed above.
check = pd.read_sql("SELECT * FROM {db}.Status".format(db=db_name), con=engine)
check


In [None]:
# All queries have run: close the connection that was opened with engine.connect()
con.close()