In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()
# import seaborn as sns
# import matplotlib.pyplot as plt
import warnings
import re
import requests
# import folium
# import branca.colormap as cm
# import geopy
from tqdm import tqdm_notebook as tqdm
import json
import os
# import geojson
import datetime

warnings.filterwarnings('ignore')

## Load trip data

In [2]:
%%time
def _convert_to_dateobject(x):
    return datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")

if not os.path.exists('../data/raw_trip_datetime_2018_Q3.pk'):
    trip_df = pd.read_csv('../data/Divvy_Trips_2018_Q3.csv')
    trip_df['start_time_dtoj'] = trip_df.apply(lambda row: _convert_to_dateobject(row.start_time), axis=1)
    trip_df['end_time_dtoj'] = trip_df.apply(lambda row: _convert_to_dateobject(row.end_time), axis=1)
    
    trip_df.to_pickle('../data/raw_trip_datetime_2018_Q3.pk')
else:
    trip_df = pd.read_pickle('../data/raw_trip_datetime_2018_Q3.pk')

CPU times: user 708 ms, sys: 330 ms, total: 1.04 s
Wall time: 1.05 s


In [3]:
trip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1513570 entries, 0 to 1513569
Data columns (total 14 columns):
trip_id              1513570 non-null int64
start_time           1513570 non-null object
end_time             1513570 non-null object
bikeid               1513570 non-null int64
tripduration         1513570 non-null object
from_station_id      1513570 non-null int64
from_station_name    1513570 non-null object
to_station_id        1513570 non-null int64
to_station_name      1513570 non-null object
usertype             1513570 non-null object
gender               1218574 non-null object
birthyear            1221990 non-null float64
start_time_dtoj      1513570 non-null datetime64[ns]
end_time_dtoj        1513570 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(4), object(7)
memory usage: 161.7+ MB


## Load station info

In [4]:
%%time
# Load from preprocessed data
sd = pd.read_csv('../data/Divvy_Stations_2017_Q3Q4.csv')

CPU times: user 10.4 ms, sys: 7.04 ms, total: 17.4 ms
Wall time: 16.6 ms


In [5]:
sd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 585 entries, 0 to 584
Data columns (total 8 columns):
id             585 non-null int64
name           585 non-null object
city           585 non-null object
latitude       585 non-null float64
longitude      585 non-null float64
dpcapacity     585 non-null int64
online_date    585 non-null object
Unnamed: 7     0 non-null float64
dtypes: float64(3), int64(2), object(3)
memory usage: 36.6+ KB


In [120]:
sd.sort_values(by='dpcapacity', ascending=False).head(10)

Unnamed: 0,id,name,city,latitude,longitude,dpcapacity,online_date,Unnamed: 7
85,97,Field Museum,Chicago,41.865312,-87.617867,55,6/30/2013 13:25,
1,3,Shedd Aquarium,Chicago,41.867226,-87.615355,55,6/10/2013 10:44,
176,195,Columbus Dr & Randolph St,Chicago,41.884728,-87.619521,47,8/7/2013 14:11,
78,90,Millennium Park,Chicago,41.881032,-87.624084,47,6/26/2013 19:51,
31,35,Streeter Dr & Grand Ave,Chicago,41.892278,-87.612043,47,6/22/2013 21:12,
173,192,Canal St & Adams St,Chicago,41.879255,-87.639904,47,8/6/2013 13:27,
39,43,Michigan Ave & Washington St,Chicago,41.883893,-87.624649,43,6/25/2013 10:57,
45,49,Dearborn St & Monroe St,Chicago,41.88132,-87.629521,39,6/25/2013 11:21,
234,255,Indiana Ave & Roosevelt Rd,Chicago,41.867888,-87.623041,39,8/31/2013 10:38,
318,341,Adler Planetarium,Chicago,41.866095,-87.607267,39,10/3/2013 12:01,


In [7]:
sd.dpcapacity.sum()/581

17.583476764199656

## Get sorted stations based on demand for a day

In [8]:
# Get the heavy demand station list for this day
def _get_hd_stn_lst(df, dd=2, mm=7):
    top_station_df = df[
        (df.start_time_dtoj.dt.day == dd) &
        (df.start_time_dtoj.dt.month == mm) 
    ].groupby(['from_station_id'])[['trip_id']]\
     .count().sort_values(by='trip_id', ascending=False)\
     .reset_index()
    
    return top_station_df

In [9]:
top_station_df = _get_hd_stn_lst(trip_df, 25, 9)

## Collect count and sum of tripdurations for each station (*time consuming*)

In [121]:
def _convert_to_float(row):
    if isinstance(row.tripduration, str):
        return float(row.tripduration.strip().replace(',',''))
    else:
        return 0.0
    
def _get_stats(df, dd, mm, st_id):
    sub_df = df[(df.start_time_dtoj.dt.day == dd) & 
                (df.start_time_dtoj.dt.month == mm) &
                (df.from_station_id == st_id)]
    if sub_df.empty:
        return 0.0, 0.0
    sm, cnt = sub_df.apply(lambda row: _convert_to_float(row), axis=1).agg(['sum', 'count'])
    return sm, cnt

#s, c = _get_stats(trip_df, 2, 7, 192)

In [122]:
def get_count_sum(d, m):
    """
    d: day in month
    m: month
    """
    low_demand_c = 0
    high_demand_c = 0
    low_demand_s = 0
    high_demand_s = 0
    for stid in tqdm(list(sd.id)):
        s, c = _get_stats(trip_df, d, m, stid)
        if c < int(sd[sd.id == stid].dpcapacity):
            low_demand_c += 1
            low_demand_s += s
        else:
            high_demand_c += 1
            high_demand_s += s
    return low_demand_c, low_demand_s, high_demand_c, high_demand_s

## Trip trend collection for a single day and a single location

In [124]:
## Helper functions

# Get net change for each trip
def _get_net(row):
    if row.incoming:
        return 1
    elif row.outgoing:
        return -1
    else:
        return 0

# Get exact time for bike rental/return for this station and then sort
def _get_time(row):
    if row.incoming:
        return row.end_time_dtoj
    elif row.outgoing:
        return row.start_time_dtoj
    else:
        return

In [209]:
# Filter trip day that meet this condition
def get_max_demand(trip_df, dd, mm, st_id):
    daily_trip_details = trip_df[
        (trip_df.start_time_dtoj.dt.day == dd) &
        (trip_df.start_time_dtoj.dt.month == mm) &
        (
            (trip_df.from_station_id == st_id) | 
            (trip_df.to_station_id == st_id)
        )
    ][['trip_id', 'tripduration', 'from_station_id', 'to_station_id', 
       'start_time_dtoj', 'end_time_dtoj']]

    # Check if incoming or outgoing
    daily_trip_details['outgoing'] = daily_trip_details.from_station_id == st_id
    daily_trip_details['incoming'] = daily_trip_details.to_station_id == st_id
    
    if daily_trip_details.empty:
        return 0
    
    daily_trip_details['net'] = daily_trip_details.apply(lambda x: _get_net(x), axis=1)

    daily_trip_details['time'] = daily_trip_details.apply(lambda x: _get_time(x), axis=1)
    daily_trip_details.sort_values(by='time', inplace=True)

    daily_trip_details['net_cum'] = -daily_trip_details['net'].cumsum()
    
    return int(daily_trip_details.net_cum.max()), int(daily_trip_details.net_cum.min())

In [210]:
# boxplot data
data_bxplt = {}

In [270]:
day_ = 7
month_ = 8
load_cache = True
file_name = '../data/max_net_demand_'+str(day_)+'_'+str(month_)+'.json'

if os.path.exists(file_name) and load_cache:
    print("Load previous data")
    with open(file_name, 'r') as f:
        tmp_dict = json.load(f)
else:
    print("Get new data")
    tmp_dict = {}
    for stid in tqdm(list(sd.id)):
        tmp_dict[str(stid)] = get_max_demand(trip_df, day_, month_, stid)
    with open('../data/max_net_demand_'+str(day_)+'_'+str(month_)+'.json', 'w') as f:
        json.dump(tmp_dict, f, indent=2)

data_bxplt[(day_, month_)] = tmp_dict

Get new data


HBox(children=(IntProgress(value=0, max=585), HTML(value='')))

In [276]:
%%time
# load data
dt_2018 = pd.read_feather('../data/Final_Divvy_data_2018.feather')

CPU times: user 102 ms, sys: 127 ms, total: 229 ms
Wall time: 140 ms


In [290]:
dt_2018[(dt_2018.month == 7) & (dt_2018.day == 4) & (dt_2018.total_out > 10)].count()

station_id                     307
month                          307
day                            307
total_in                       307
total_out                      307
year                           307
lon_ave                        307
lat_ave                        307
dp_max                         307
dp_min                         307
city_Chicago                   307
city_Evanston                  307
city_Oak_Park                  307
days_online                    307
dayofweek                      307
apparentTemperatureHigh        307
apparentTemperatureHighTime    307
apparentTemperatureLow         307
apparentTemperatureLowTime     307
apparentTemperatureMax         307
apparentTemperatureMaxTime     307
apparentTemperatureMin         307
apparentTemperatureMinTime     307
cloudCover                     307
dewPoint                       307
humidity                       307
moonPhase                      307
precipIntensity                307
precipIntensityMax  

In [292]:
dt_2018[(dt_2018.month == 7) & (dt_2018.day == 4)].total_out.sum()

15512.0

In [306]:
dt_2018[(dt_2018.month == 7) & (dt_2018.day == 2) & (dt_2018.station_id== 2)].total_out

134    79.0
Name: total_out, dtype: float64

In [273]:
data_bxplt[(2, 7)]

{'2': (8, -8),
 '3': (3, -35),
 '4': (0, -13),
 '5': (3, -2),
 '6': (18, -2),
 '7': (1, -13),
 '9': (1, -1),
 '11': (-1, -2),
 '12': (1, -1),
 '13': (19, -6),
 '14': (9, 0),
 '15': (8, -2),
 '16': (13, -2),
 '17': (5, -13),
 '18': (1, -16),
 '19': (1, -12),
 '20': (-1, -14),
 '21': (8, -10),
 '22': (10, -4),
 '23': (8, 0),
 '24': (9, -6),
 '25': (3, -6),
 '26': (33, 1),
 '27': (5, 0),
 '28': (14, 1),
 '29': (7, -7),
 '30': (10, -9),
 '31': (4, -4),
 '32': (12, 1),
 '33': (1, -15),
 '34': (4, -24),
 '35': (3, -164),
 '36': (9, -29),
 '37': (-1, -15),
 '38': (5, -13),
 '39': (-1, -23),
 '40': (2, -13),
 '41': (13, -8),
 '42': (1, -2),
 '43': (14, -92),
 '44': (5, -17),
 '45': (5, -4),
 '46': (14, 1),
 '47': (-1, -28),
 '48': (1, -53),
 '49': (7, -27),
 '50': (5, -6),
 '51': (1, -33),
 '52': (7, -29),
 '53': (5, -7),
 '54': (13, 1),
 '55': (2, -7),
 '56': (11, -2),
 '57': (1, -6),
 '58': (8, -2),
 '59': (13, -11),
 '60': (2, -9),
 '61': (7, -16),
 '62': (7, -4),
 '66': (2, -15),
 '67': (4

In [349]:
%%time
cutoff = 15
cutoff_h = 15
hist_dict = {}
# for date_ in [(1, 7)]:
for date_ in [(i, j) for i in range(1, 8) for j in [7,8]]:
    d, m = date_
    low_count = 0
    low_out_count = 0
    mid_count = 0
    mid_out_count = 0
    high_count = 0
    high_out_count = 0
    exhigh_count = 0
    exhigh_out_count = 0
    for stid in list(sd.id):
        if data_bxplt[date_][str(stid)] == 0:
            max_net = 0
        else:
            max_net = max(data_bxplt[date_][str(stid)][0], -data_bxplt[date_][str(stid)][1])
        day_out = int(dt_2018[(dt_2018.month == m) & (dt_2018.day == d) & (dt_2018.station_id == stid)].total_out)
        if day_out < 20:
            low_count += 1
            if max_net > cutoff:
                low_out_count += 1
        elif day_out < 50:
            mid_count += 1
            if max_net > cutoff:
                mid_out_count += 1
        elif day_out < 100:
            high_count += 1
            if max_net > cutoff_h:
                high_out_count += 1
        else:
            exhigh_count += 1
            if max_net > cutoff_h:
                exhigh_out_count += 1
    hist_dict[date_] = dict(
        lc=low_count, loc=low_out_count, 
        mc=mid_count, moc=mid_out_count,
        hc=high_count, hoc=high_out_count,
        ec=exhigh_count, eoc=exhigh_out_count
    )

CPU times: user 23.9 s, sys: 384 ms, total: 24.3 s
Wall time: 26.1 s


In [350]:
result = []
for tp in ['lc', 'loc', 'mc', 'moc', 'hc', 'hoc', 'ec', 'eoc']:
    tmp = [hd[tp] for hd in hist_dict.values()]
    result.append(sum(tmp)/len(tmp))

In [351]:
result

[336.14285714285717,
 0.8571428571428571,
 137.07142857142858,
 15.714285714285714,
 82.78571428571429,
 35.07142857142857,
 29.0,
 25.357142857142858]

In [352]:
ratios = [result[i*2+1]/result[i*2] for i in range(4)]

In [353]:
ratios

[0.00254993625159371,
 0.11464304325169357,
 0.4236410698878343,
 0.874384236453202]

## Plot as function of time

[Great example of make multi-type subplots](https://plot.ly/~empet/15130/mixed-2d-and-3d-subplots-forum/#/)

In [13]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from ipywidgets import widgets
import plotly.graph_objs as go
from plotly import tools

init_notebook_mode(connected=True)

In [14]:
import plotly
plotly.__version__

'3.10.0'

In [18]:
# style dict
style_dict = {}
style_dict['color'] = {'in':'red', 'out':'orange', 'net':'blue'}
style_dict['name'] = {'in':'Return', 'out':'Rental', 'net':'Net Demand'}

In [19]:
## Function to get trip trend data
def _get_trip_trend_date(df, conf, vis=False):
    return go.Scatter(
        x=df.time,
        y=df[conf+'_cum'],
        name = style_dict['name'][conf],
        mode = 'lines+markers',
        line = dict(
            color = style_dict['color'][conf],
            
        ),
        visible=vis
    )

## Function to get map data
def _get_map_data(df, vis=False):
    return go.Scattermapbox(
        lat=[float(df.latitude)],
        lon=[float(df.longitude)],
        text=["<b>Station id</b>: {}\
               <br> <b>Latitude</b>: {:.4f} \
               <br> <b>Longitude</b>: {:.4f} \
               <br> <b>Station capacity</b>: {}\
               <br> <b>Station name</b>: <br> {}\
              ".format(
            int(df.id),
            float(df.latitude), 
            float(df.longitude),
            int(df.dpcapacity),
            str(df.name.to_string()), 
        )],
        mode='markers',
        hoverinfo = 'text',
        marker=go.scattermapbox.Marker(
            size=10,
            color='yellow',
        ),   
        subplot='mapbox',
        visible=vis,
        name='Station '+str(int(df.id)),
        showlegend=False,
    )

In [26]:
## Style of mixed plot
layout = {
    'title': {
        'text': 'Trend of bike rental and net demand',
        'font': dict(
            family='Droid Serif, serif',
            size=30,
            color='Black'
        ),
    },
    'yaxis': {
        'zeroline': False,
#         'showgrid': True,
        'title': "Number of bikes",
         'titlefont': dict(
            family='Arial, sans-serif',
            size=25,
            color='grey'
        ),
#         'range': [-100, 420],
        'domain': [0, 0.95],
        'tickangle': -45,
        'tickfont': dict(
            family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
    },
    'xaxis': {
        'zeroline': True,
#         'showgrid': True,
        'domain': [0., 0.99],
        'tickangle': 0,
        'tickfont': dict(
            family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
    }, 
#     'paper_bgcolor': 'black',
    'showlegend': True,
    'autosize': True,
    'legend': dict(
        orientation="v", 
        x=0.5, 
        y=1,
        font=dict(
            size=16,
        ),
    ),
    'margin': go.layout.Margin(l=60, r=10, b=10, t=50, pad=6),
    'shapes': [
        go.layout.Shape(
            type="rect",
            xref="paper",
            yref="y",
            x0="0",
            y0=-35,
            x1="1",
            y1=35,
            fillcolor="lightgrey",
            opacity=0.5,
            layer="below",
            line_width=0,
        ),
        go.layout.Shape(
            type="line",
            xref="paper",
            yref="y",
            x0=0,
            y0=0,
            x1=1,
            y1=0,
            line=dict(
                color="black",
                width=0.5,
                dash='dot',
            ),
        ),
    ],
}

In [74]:
# Set date and station list
pm = 7
pd = 2

# station_list = [192, 100, 35, 91, 56]
# station_list = [192, 177, 100, 143]
station_list = list(top_station_df.from_station_id)[:30]

In [75]:
%%time
## Get data
data = []

vis_flag = True
for st_id in station_list:
    daily_trip_details, station_info = get_trip_trips(trip_df, pd, pm, st_id)

    data.append(_get_trip_trend_date(daily_trip_details, 'out', vis=vis_flag))
    data.append(_get_trip_trend_date(daily_trip_details, 'net', vis=vis_flag))
    vis_flag = False

CPU times: user 17 s, sys: 193 ms, total: 17.2 s
Wall time: 17.4 s


In [76]:
# Create button list
button_list = []
n_st = len(station_list)

blank = [False] * n_st *2

for i in range(n_st):
    vis_lst = blank.copy()
    vis_lst[i*2:i*2+2] = [True] * 2
    label_ = 'Station '+str(station_list[i])
    
    tmp_d = dict(
        args = [{'visible': vis_lst}],
        label = label_,
        method ='update'
    )
    button_list.append(tmp_d)

In [77]:
updatemenus=list([
    dict(
        buttons=button_list,
        direction = 'up',
        x = 0.82,
        xanchor = 'left',
        y = -0.18,
        yanchor = 'bottom',
        bgcolor = 'lightgrey',
        bordercolor = 'black',
        font = dict(size=11, color='black'),
        showactive=False,
    ),        
])

layout['updatemenus'] = updatemenus

In [78]:
figure = {}
figure['data'] = data
figure['layout'] = layout


SHOW = True

if SHOW:
    iplot(figure, config={'displayModeBar': False})
else:
    plot(figure, config={'displayModeBar': False}, filename="bike_trip_trend.html")