In [56]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re
import requests
import folium
import branca.colormap as cm
import geopy
from tqdm import tqdm_notebook as tqdm
import json

from useful_functions import *

# Suppress every warning for the rest of the session (deprecation notices included).
warnings.filterwarnings('ignore')
# Let pandas render up to 1000 rows when a DataFrame/Series is displayed.
pd.set_option('display.max_rows', 1000)

## Collect the most recent station data from Divvy [JSON Feed](https://feeds.divvybikes.com/stations/stations.json)

In [20]:
# request url
# A timeout keeps the cell from hanging indefinitely if the feed is unreachable, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON/KeyError in the next cell.
res = requests.get('https://feeds.divvybikes.com/stations/stations.json', timeout=30)
res.raise_for_status()

In [21]:
# Turn the feed's 'stationBeanList' array into a DataFrame
station_records = res.json()['stationBeanList']
df_now = pd.DataFrame(station_records)

## Check past station data


In [22]:
# Load station data for previous years.
# The date column is named differently across releases ('online date', 'dateCreated',
# 'online_date'); the 2015 file is read without a date column.
_STATION_COLS = ['id', 'latitude', 'longitude', 'dpcapacity']

df_2013 = pd.read_csv('data/Divvy_Stations_2013.csv', usecols=_STATION_COLS + ['online date'])
df_2014_1 = pd.read_csv('data/Divvy_Stations_2014-Q1Q2.csv', usecols=_STATION_COLS + ['online date'])
df_2014_2 = pd.read_csv('data/Divvy_Stations_2014-Q3Q4.csv', usecols=_STATION_COLS + ['dateCreated'])
# Fixed: this previously re-read 'Divvy_Stations_2013.csv', so the 2015 snapshot was
# never loaded and the 2013 values were counted twice in the combined statistics.
# NOTE(review): confirm the file name matches the local copy of the 2015 station file.
df_2015 = pd.read_csv('data/Divvy_Stations_2015.csv', usecols=_STATION_COLS)
df_2016_1 = pd.read_csv('data/Divvy_Stations_2016_Q1Q2.csv', usecols=_STATION_COLS + ['online_date'])
df_2016_2 = pd.read_csv('data/Divvy_Stations_2016_Q3.csv', usecols=_STATION_COLS + ['online_date'])
df_2016_3 = pd.read_csv('data/Divvy_Stations_2016_Q4.csv', usecols=_STATION_COLS + ['online_date'])
df_2017_1 = pd.read_csv('data/Divvy_Stations_2017_Q1Q2.csv', usecols=_STATION_COLS + ['online_date'])
df_2017_2 = pd.read_csv('data/Divvy_Stations_2017_Q3Q4.csv', usecols=_STATION_COLS + ['online_date', 'city'])

In [23]:
# Outer-join the 2013 snapshot with the live feed on station id. Columns present in
# both frames get the default _x (2013) / _y (live feed) suffixes, and every missing
# value is replaced by 0.
joined = df_2013.merge(df_now, how='outer', on='id')
latest = joined.fillna(0)

In [24]:
# Map comparing station locations from the 2013 file ("Past") with the live feed ("Recent").
map_from = folium.Map(location = [41.90, -87.64], tiles="Stamen Toner", zoom_start = 10.5)
wo_group = folium.FeatureGroup(name="Past")
wn_group = folium.FeatureGroup(name="Recent")

# `latest` was built with fillna(0), so a station that is absent from one side of the
# merge carries 0 (not None/NaN) in that side's coordinate columns.

# 2013 coordinates (_x columns): hollow orange circles.
for lat, long in zip(latest.latitude_x, latest.longitude_x):
    if pd.isnull(lat) or pd.isnull(long) or lat == 0 or long == 0:
        continue
    folium.CircleMarker([lat, long], color="orange", fill=False, radius=5, weight=2, fill_opacity=1).add_to(wo_group)

# Live-feed coordinates (_y columns): filled blue circles.
for lat, long in zip(latest.latitude_y, latest.longitude_y):
    # Fixed: the old `== None` test never matched the 0 fill value, so stations missing
    # from the live feed were drawn at latitude/longitude (0, 0).
    if pd.isnull(lat) or pd.isnull(long) or lat == 0 or long == 0:
        continue
    folium.CircleMarker([lat, long], color="blue", fill=True, radius=3, weight=2, fill_opacity=1).add_to(wn_group)

wo_group.add_to(map_from)
wn_group.add_to(map_from)

folium.LayerControl().add_to(map_from)
map_from

## Combine station data from different years
- Online date: use the most recent record
- geocode: average the longitude and latitude over all past/current records
- dpcapacity: track the maximum and minimum capacity over all past/current records (`dp_max`, `dp_min`)

In [25]:
# Start from the live feed, then outer-join every yearly snapshot onto it by station id.
# Snapshot columns get a positional suffix (_0 ... _8) so they stay distinguishable.
df_cmb = df_now[['id', 'totalDocks', 'longitude', 'latitude']].copy()

yearly_frames = [df_2013, df_2014_1, df_2014_2, df_2015, df_2016_1, df_2016_2, df_2016_3, df_2017_1, df_2017_2]
for idx, yearly in enumerate(yearly_frames):
    suffix = str(idx)
    new_names = {
        "longitude": "longitude_" + suffix,
        "latitude": "latitude_" + suffix,
        "dpcapacity": "dpcapacity_" + suffix,
    }
    # Any column whose name contains 'date' becomes online_date_<idx>
    new_names.update({col: 'online_date_' + suffix for col in yearly.columns if 'date' in col})
    df_cmb = df_cmb.merge(yearly.rename(columns=new_names), on='id', how='outer')

In [26]:
# Get average geo codes
def _get_average_geo(row, geo_di='longitude'):
    lon = []
    if not np.isnan(row[geo_di]):
        lon.append(row[geo_di])
    for i in range(9):
        if not np.isnan(row[geo_di+'_'+str(i)]):
            lon.append(row[geo_di+'_'+str(i)])
    return np.mean(np.array(lon))

# Row-wise average of the longitude / latitude readings of each row
for out_col, coord in (('lon_ave', 'longitude'), ('lat_ave', 'latitude')):
    df_cmb[out_col] = df_cmb.apply(_get_average_geo, axis=1, geo_di=coord)

In [27]:
# Get capacity number
def _get_dp_stats(row, attr='max'):
    dp = []
    for i in range(9):
        if not np.isnan(row['dpcapacity_'+str(i)]):
            dp.append(row['dpcapacity_'+str(i)])
    if not np.isnan(row['totalDocks']):
        dp.append(row['totalDocks'])
    if dp == []:
        return 0
    return getattr(np, attr)(dp)

# Largest and smallest capacity value found for each row
for stat in ('max', 'min'):
    df_cmb['dp_' + stat] = df_cmb.apply(_get_dp_stats, axis=1, attr=stat)

In [28]:
# Collect online date
def _get_online_date(row):
    for i in range(8, -1, -1):
        if 'online_date_'+str(i) not in row:
            continue
        if type(row['online_date_'+str(i)]) == str:
            return row['online_date_'+str(i)]
        if np.isnan(row['online_date_'+str(i)]):
            continue
    return 

# One online-date string (or None) per row, computed by _get_online_date
df_cmb['online_date'] = df_cmb.apply(_get_online_date, axis=1)

In [29]:
# Number of rows that still have no online date
df_cmb['online_date'].isna().sum()

37

## Convert online date into month, day and year

In [30]:
def _get_online_time(string, tp):
    index_dict = {
        'month': 0,
        'day': 1,
        'year': 2
    }
    if string == None:
        return 

    return int(re.match(r'([0-9]+)/([0-9]+)/([0-9]+)(\s|$)', string).groups()[index_dict[tp]])

In [31]:
# Split online_date into the numeric online_month / online_day / online_year columns
for part in ('month', 'day', 'year'):
    df_cmb['online_' + part] = df_cmb.apply(
        lambda record, part=part: _get_online_time(record.online_date, part),
        axis=1,
    )

## Compare to the most recent data (year 2018) to check the online date for new stations

In [32]:
%%time
# Read the 2018 trip data (feather file) used to check station online time
trips_2018_path = 'data/Divvy_data_2018.feather'
data_2018 = pd.read_feather(trips_2018_path)

CPU times: user 1.17 s, sys: 400 ms, total: 1.57 s
Wall time: 1.33 s


In [33]:
%%time
# Back-fill online month/day/year for stations that have no online date, using the
# first 2018 trip that departs from each of them.
station_wo_online_date = np.array(df_cmb[df_cmb.online_date.isnull()].id)
for sid in station_wo_online_date:
    trips_from_station = data_2018[data_2018.from_station_id == sid]
    if trips_from_station.empty:
        # no 2018 trip starts at this station; leave its date fields untouched
        continue
    # Fixed: the lookup used the hard-coded station id 621, so every station was given
    # station 621's date instead of its own.
    # NOTE(review): iloc[0] is the earliest trip only if data_2018 is in chronological
    # order; confirm.
    first_trip = trips_from_station.iloc[0]
    station_rows = df_cmb.id == sid
    df_cmb.loc[station_rows, 'online_month'] = first_trip.month
    df_cmb.loc[station_rows, 'online_day'] = first_trip.day
    df_cmb.loc[station_rows, 'online_year'] = first_trip.year

CPU times: user 5.59 s, sys: 76.5 ms, total: 5.67 s
Wall time: 2.9 s


## Cleaning up data

In [34]:
# Remove the per-snapshot helper columns that exist, then the raw inputs they were merged with
yearly_cols = [
    f'{base}_{k}'
    for base in ['latitude', 'longitude', 'dpcapacity', 'online_date']
    for k in range(9)
    if f'{base}_{k}' in df_cmb.columns
]
df_cmb.drop(yearly_cols, axis=1, inplace=True)
df_cmb.drop(['totalDocks', 'longitude', 'latitude', 'online_date'], axis=1, inplace=True)

In [52]:
# Distinct values of the city column (NaN included)
df_cmb['city'].unique()

array(['Chicago', 'Chicago ', 'Evanston', nan, 'Oak Park'], dtype=object)

## Collect city data

In [48]:
from geopy.geocoders import Nominatim

In [49]:
# Geocoder client from geopy's Nominatim backend.
# NOTE(review): user_agent is still the placeholder text; presumably it should be a
# real application identifier. Confirm against the Nominatim usage policy.
geolocator = Nominatim(user_agent="specify_your_app_name_here")

In [54]:
# Averaged coordinates of the rows whose city is missing
no_city = df_cmb[df_cmb.city.isnull()]
lon_lst = np.array(no_city.lon_ave)
lat_lst = np.array(no_city.lat_ave)

#for i in tqdm(range(len(lon_lst))):
    #print(geolocator.reverse("{},{}".format(lat_lst[i], lon_lst[i])))

In [57]:
# Load zip geojson
# JSON text is UTF-8; pinning the encoding keeps the read independent of the
# platform's default text encoding.
with open('geo_data/Boundaries_ZIPCodes.geojson', 'r', encoding='utf-8') as p:
    zipcoes = json.load(p)

In [68]:
from folium import GeoJson

# Map of the rows without a city value, drawn over the ZIP-code boundaries
check_map = folium.Map(location=[41.90, -87.64], zoom_start=10.5)
group = folium.FeatureGroup(name="O")

# One circle marker per coordinate pair; pairs containing a 0 are skipped
for lat, lon in zip(lat_lst, lon_lst):
    if lat != 0 and lon != 0:
        marker = folium.CircleMarker(
            [lat, lon],
            color="black",
            fillColor="orange",
            fill=True,
            radius=8,
            weight=2,
            fill_opacity=0.3,
        )
        marker.add_to(group)

# Boundaries are added first, the marker group after them
GeoJson(zipcoes).add_to(check_map)
group.add_to(check_map)

folium.LayerControl().add_to(check_map)
check_map

In [70]:
import shapely
# out = []
# for i in tqdm(range(618)):
#     lon, lat = df_cmb.iloc[i].lon_ave, df_cmb.iloc[i].lat_ave
#     out.append((i, geolocator.reverse("{},{}".format(lat, lon))))

ModuleNotFoundError: No module named 'shapely'

## Saving data into feather

In [18]:
# save
# reset_index() moves the current index into a new leading column before writing
df_cmb = df_cmb.reset_index()
station_out_path = 'data/final_station_data.feather'
df_cmb.to_feather(station_out_path)

In [19]:
# Print the column names, non-null counts, dtypes and memory usage of the saved frame
df_cmb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 9 columns):
index           618 non-null int64
id              618 non-null int64
lon_ave         618 non-null float64
lat_ave         618 non-null float64
dp_max          618 non-null float64
dp_min          618 non-null float64
online_month    618 non-null float64
online_day      618 non-null float64
online_year     618 non-null float64
dtypes: float64(7), int64(2)
memory usage: 43.5 KB
