# Data Collection

For this project I will use data from several different sources.

### Import Libraries and Define Functions

In [84]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import requests

### Map and Background Data

I use the official neighborhood boundaries as determined by a city survey. (TODO: add the link to the boundary data source.)

## Indego Data

Indego provides access to their data at <https://www.rideindego.com/about/data/>. The Indego data I am interested in are:
 - Trip Data, provided quarterly since Q2 2015 and up until Q2 2020
 - A JSON API with real-time station status information

The first will form the backbone of the observations and I will aggregate it to compute the traffic flows from neighborhoods. The latter will provide some basic variables to be joined to the aggregate such as location and number of docks available.

### Trip Data
The data must be downloaded by quarter, and so must be merged before it can be used. Currently there are four full years of data (2016–2019). I will not use partial years so as to keep my observations balanced across the year. 

In [2]:
# Quarterly trip files to load: every quarter of the full years 2016-2019,
# listed in chronological order (2016-q1, 2016-q2, ..., 2019-q4).
TRIP_YEARS = range(2016, 2020)
TRIP_QUARTERS = range(1, 5)
trip_paths = [
    f'data/indego/indego-trips-{year}-q{quarter}.csv'
    for year in TRIP_YEARS
    for quarter in TRIP_QUARTERS
]

# Read each quarter and stack them into a single table with a fresh index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the documented remedy for the
# mixed-type DtypeWarning.
# NOTE(review): the truncated warning output that used to follow the 2018-q4
# load looked like that warning -- confirm.
# NOTE(review): the per-quarter names df1..df16 are no longer defined; nothing
# in the visible part of the notebook uses them other than this concat.
trip_frames = [pd.read_csv(path, low_memory=False) for path in trip_paths]
trip_df = pd.concat(trip_frames, ignore_index=True)

In [22]:
# Summarize the combined trip table: row count, column dtypes and memory use.
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2860043 entries, 0 to 2860042
Data columns (total 17 columns):
 #   Column               Dtype  
---  ------               -----  
 0   trip_id              int64  
 1   duration             int64  
 2   start_time           object 
 3   end_time             object 
 4   start_station_id     float64
 5   start_lat            object 
 6   start_lon            object 
 7   end_station_id       float64
 8   end_lat              object 
 9   end_lon              object 
 10  bike_id              object 
 11  plan_duration        float64
 12  trip_route_category  object 
 13  passholder_type      object 
 14  start_station        float64
 15  end_station          float64
 16  bike_type            object 
dtypes: float64(5), int64(2), object(10)
memory usage: 370.9+ MB


### Station Data

I will make use of the `active` variable to select only stations that are currently active for my model. While it is possible that historical data from inactive stations could add predictive power, for now I will discard them. 

In [45]:
# Station list CSV. header=0 discards the file's own header row so that the
# short column names given here are used instead.
station_columns = ['id', 'name', 'open_on', 'active']
stations = pd.read_csv(
    r'data/indego/indego-stations.csv',
    header=0,
    names=station_columns,
)

In [46]:
# Preview the first rows of the station list.
stations.head()

Unnamed: 0,id,name,open_on,active
0,3004,Municipal Services Building Plaza,23-Apr-15,Active
1,3005,"Welcome Park, NPS",23-Apr-15,Active
2,3006,40th & Spruce,23-Apr-15,Active
3,3007,"11th & Pine, Kahn Park",23-Apr-15,Active
4,3008,Temple University Station,23-Apr-15,Active


In [47]:
# Confirm the column names that were applied when the CSV was read.
stations.columns

Index(['id', 'name', 'open_on', 'active'], dtype='object')

In [37]:
# Request the station status feed.
# The parsed feed is kept under its own name so that it does not overwrite
# `stations` (the station list loaded from CSV), which the join that builds
# `rich_stations` still needs when the notebook is run from top to bottom.
r = requests.get('http://www.rideindego.com/stations/json/', timeout=30)
r.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
station_feed = r.json()

# Build a GeoDataFrame from the feed's feature records and declare the
# coordinates as EPSG:4326 (WGS84 longitude/latitude).
gdf = gpd.GeoDataFrame.from_features(station_feed['features'])
gdf = gdf.set_crs('EPSG:4326')

In [51]:
# List the columns available on the station GeoDataFrame.
gdf.columns

Index(['geometry', 'id', 'name', 'coordinates', 'totalDocks', 'docksAvailable',
       'bikesAvailable', 'classicBikesAvailable', 'smartBikesAvailable',
       'electricBikesAvailable', 'rewardBikesAvailable',
       'rewardDocksAvailable', 'kioskStatus', 'kioskPublicStatus',
       'kioskConnectionStatus', 'kioskType', 'addressStreet', 'addressCity',
       'addressState', 'addressZipCode', 'bikes', 'closeTime', 'eventEnd',
       'eventStart', 'isEventBased', 'isVirtual', 'kioskId', 'notes',
       'openTime', 'publicText', 'timeZone', 'trikesAvailable', 'latitude',
       'longitude'],
      dtype='object')

In [56]:
# Only a few of the GeoDataFrame columns are needed: the geometry, the 'id'
# used as the join key, the raw coordinates and 'totalDocks'.
cols = ['geometry', 'id', 'coordinates', 'totalDocks']

In [76]:
# Index the selected GeoDataFrame columns by station id, then left-join them
# onto the station list; stations with no match in gdf get missing values.
station_attrs = gdf[cols].set_index('id')
rich_stations = stations.join(station_attrs, on='id')

In [77]:
# Inspect the joined table; the non-null counts show how many stations found a match in gdf.
rich_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   id           154 non-null    int64   
 1   name         154 non-null    object  
 2   open_on      154 non-null    object  
 3   active       154 non-null    object  
 4   geometry     142 non-null    geometry
 5   coordinates  142 non-null    object  
 6   totalDocks   142 non-null    float64 
dtypes: float64(1), geometry(1), int64(1), object(4)
memory usage: 8.5+ KB


In [78]:
# Parse the 'open_on' strings into datetimes so they can be compared as dates.
rich_stations = rich_stations.assign(
    open_on=pd.to_datetime(rich_stations['open_on'])
)

In [79]:
# Keep only stations that opened before 2020 and are flagged 'Active'.
opened_before_2020 = rich_stations['open_on'] < pd.to_datetime('2020-01-01')
is_active = rich_stations['active'] == 'Active'
rich_stations = rich_stations[opened_before_2020 & is_active].copy()

In [82]:
# Re-check the table after filtering: row count and non-null counts per column.
rich_stations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139 entries, 0 to 151
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           139 non-null    int64         
 1   name         139 non-null    object        
 2   open_on      139 non-null    datetime64[ns]
 3   active       139 non-null    object        
 4   geometry     139 non-null    geometry      
 5   coordinates  139 non-null    object        
 6   totalDocks   139 non-null    float64       
dtypes: datetime64[ns](1), float64(1), geometry(1), int64(1), object(3)
memory usage: 8.7+ KB


In [None]:
# NOTE(review): stray, never-executed cell that only evaluates the pandas module; looks like a leftover that can be deleted.
pd

# Weather Data

In [20]:
# Hourly historical weather for Philadelphia (per the file name).
weather_path = r'data/weather/Philadelphia_Historical_Weather_Hourly.csv'
weather = pd.read_csv(weather_path)