# Bikeshare rides -- station classification (Winter 2018)
This notebook explores several ways of classifying bikeshare stations based on how they are used, such as how each station's trips are distributed across the hours of the day.

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplleaflet
import pandas as pd
import numpy as np
import json
import math
from matplotlib import cm

#import os

#os.environ['PROJ_LIB'] = r'C:\Users\pgsze\Anaconda3\pkgs\proj4-5.2.0-ha925a31_1\Library\share'

from mpl_toolkits.basemap import Basemap

# Import, and set to log to the console.  (See the console which is running
# Jupyter notebook for logging about HTTP requests.)
#import tilemapbase
#tilemapbase.start_logging()

In [2]:
# Load the pre-computed Winter 2018 (NYC) DataFrames from local pickle files.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# only load pickle files that were produced by this project.
subs_df, stations_info_df, subs_trips_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './data/pickledDFs/subs_df-NYC_Winter_2018.pkl',
        './data/pickledDFs/stations_info_df-NYC_Winter_2018.pkl',
        './data/pickledDFs/subs_trips_df-NYC_Winter_2018.pkl',
    )
)

In [None]:
#Commented out, already saved in pickles that are read above
#winter_df_list = []
#
#for el in ['01','02','03']:
#    winter_df_list.append(pd.read_csv('./data/NYC/2018/2018{}-citibike-tripdata/2018{}-citibike-tripdata.csv'.format(el,el)))
#
##df = pd.read_csv('./data/NYC/2018/201806-citibike-tripdata/201806-citibike-tripdata.csv')
#winter_df_list[0].head()

In [None]:
#winter_df = pd.concat(winter_df_list,ignore_index=True)#, keys = ['Jun', 'Jul', 'Aug'])
#winter_df.head()

In [3]:
# Day counts for January - March 2018, split into weekend days and weekdays.
# Holidays are NOT subtracted from the weekday count (maybe later...).
num_WE = 25
days_in_period = sum((31, 28, 31))  # Jan + Feb + Mar 2018
num_WD = days_in_period - num_WE

In [6]:
# Split the trips into weekend trips and everything else, using Trip_Type.
# Rows whose Trip_Type is anything other than 'Weekend' count as weekday trips.
is_weekend_trip = subs_trips_df['Trip_Type'] == 'Weekend'
weekday_subs_trips_df = subs_trips_df[~is_weekend_trip].copy()
weekend_subs_trips_df = subs_trips_df[is_weekend_trip].copy()

In [83]:
# Number of trips in subs_trips_df that start at each station (rows with a
# non-null tripduration), ordered from the busiest station down.
trips_per_station = subs_trips_df.groupby('start station id')['tripduration'].count()
sorted_stations_df = (
    trips_per_station
    .sort_values(ascending=False)
    .rename('total_trips')
    .reset_index()
)
# After reset_index the positional index is the station's rank (0 = busiest).
sorted_stations_df['rank'] = sorted_stations_df.index

# Rank vs. total trips, one square marker per station.
fig = plt.figure()
ax = fig.gca()

ax.scatter(sorted_stations_df['rank'].index, sorted_stations_df['total_trips'], marker='s', s=5)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1b6a50df6d8>

In [87]:
# Degree-1 least-squares fit of log(total_trips) against station rank,
# using only the stations at ranks 100..699.
fit_ranks = sorted_stations_df.index[100:700]
fit_log_trips = np.log(sorted_stations_df['total_trips'][100:700])
fit = np.polyfit(fit_ranks, fit_log_trips, 1)

# Fitted line evaluated at ranks 0..799, kept with shape (1, 800).
slope, intercept = fit[0], fit[1]
fit_line = (np.arange(800)*slope + intercept).reshape(1, 800)

fig = plt.figure()
ax = fig.gca()

# Draw the fit as a single red segment between ranks 1 and 799, then overlay
# the observed log(total_trips) for every station.
segment_ends = [1, 799]
ax.plot(tuple(segment_ends), tuple(fit_line[0][end] for end in segment_ends), color='red')
ax.scatter(sorted_stations_df.index, np.log(sorted_stations_df['total_trips']), marker='s', s=5)


<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x1b6c08f3588>

In [88]:
# Count the weekday trips for every (pickup hour, start station) pair.
hour_stations_gb = weekday_subs_trips_df.groupby(['pickup_hour','start station id'])
count_by_hour_station_df = hour_stations_gb.count()['tripduration'].rename('num_trips')

# Reshape to one row per station with one ('num_trips', hour) column per pickup
# hour, then add the per-station total over all hours.
num_trips_df = count_by_hour_station_df.reset_index(level=0).pivot(columns='pickup_hour')
num_trips_df['total_trips'] = num_trips_df.sum(axis=1)

# Busiest stations first; hours with no trips for a station become 0.
# astype() returns a new frame, so its result has to be kept -- the previous
# bare `sorted_num_trips_df.astype(int)` statement had no effect.
sorted_num_trips_df = (
    num_trips_df
    .sort_values('total_trips', ascending=False)
    .fillna(0)
    .astype(int)
)

# Grand total of trips over all stations in sorted_stations_df.
total = sorted_stations_df['total_trips'].sum()



In [89]:
# One panel per hour of the day (4 x 6 grid).  Blue: weekday trips per station
# in that hour, in sorted_num_trips_df row order.  Red: each station's total
# trips from sorted_stations_df divided evenly over 24 hours.
fig, ax = plt.subplots(nrows = 4, ncols = 6, figsize=(8,12))
ax = ax.flatten()

station_positions = range(len(sorted_num_trips_df.index))
even_hourly_share = np.divide(np.array(sorted_stations_df['total_trips']),24)

for hour, panel in enumerate(ax):

    # 12-hour clock title: hours 0 and 12 are shown as 12 AM / 12 PM.
    title_template = '{} AM' if hour < 12 else '{} PM'
    panel.set_title(title_template.format(str(hour % 12 or 12)),fontsize = 8)

    hourly_trips = sorted_num_trips_df['num_trips',hour]
    panel.scatter(station_positions,hourly_trips,s=1)
    panel.get_xaxis().set_visible(False)
    panel.tick_params(labelsize=8)

    # Total trips in this hour; only needed by the alternative reference below.
    weight = hourly_trips.sum()
    #panel.scatter(sorted_stations_df.index,np.divide(np.array(sorted_stations_df['total_trips'])*weight,total),marker='s',color='red',s=1)
    panel.scatter(sorted_stations_df.index,even_hourly_share,marker='s',color='red',s=1)


<IPython.core.display.Javascript object>

In [90]:
# One panel per hour of the day (4 x 6 grid).  Blue: weekday trips per station
# in that hour, in sorted_num_trips_df row order.  Red: each station's total
# trips from sorted_stations_df scaled by that hour's share of `total`.
fig, ax = plt.subplots(nrows = 4, ncols = 6, figsize=(8,12))
ax = ax.flatten()

station_positions = range(len(sorted_num_trips_df.index))
station_totals = np.array(sorted_stations_df['total_trips'])

for hour, panel in enumerate(ax):

    # 12-hour clock title: hours 0 and 12 are shown as 12 AM / 12 PM.
    title_template = '{} AM' if hour < 12 else '{} PM'
    panel.set_title(title_template.format(str(hour % 12 or 12)),fontsize = 8)

    hourly_trips = sorted_num_trips_df['num_trips',hour]
    panel.scatter(station_positions,hourly_trips,s=1)
    panel.get_xaxis().set_visible(False)
    panel.tick_params(labelsize=8)

    # Total trips in this hour, used to scale the per-station totals.
    weight = hourly_trips.sum()
    weighted_share = np.divide(station_totals*weight,total)
    panel.scatter(sorted_stations_df.index,weighted_share,marker='s',color='red',s=1)
    #panel.scatter(sorted_stations_df.index,np.divide(station_totals,24),marker='s',color='red',s=1)
    #ax[num].scatter(sorted_stations_df.index,np.divide(np.array(sorted_stations_df['total_trips']),24),marker='s',color='red',s=1)


<IPython.core.display.Javascript object>

In [91]:

# One panel per hour of the day (4 x 6 grid) showing, for every station, the
# relative deviation of its weekday trips in that hour from an even 1/24 share
# of the station's total trips.
fig, ax = plt.subplots(nrows = 4, ncols = 6, figsize=(8,12))
ax = ax.flatten()

# sorted_stations_df and sorted_num_trips_df are sorted by different totals, so
# their rows are not guaranteed to be in the same station order.  Look the
# totals up by station id so each row is compared with its OWN station's total
# (stations missing from sorted_stations_df become NaN instead of shifting rows).
station_totals = (
    sorted_stations_df
    .set_index('start station id')['total_trips']
    .reindex(sorted_num_trips_df.index)
    .values
)
# Loop-invariant: each station's total spread evenly over 24 hours.
average = np.divide(station_totals,24)

for num in range(24):

    # 12-hour clock title: hours 0 and 12 are shown as 12 AM / 12 PM.
    title_template = '{} AM' if num < 12 else '{} PM'
    ax[num].set_title(title_template.format(str(num % 12 or 12)),fontsize = 8)

    hourly_trips = sorted_num_trips_df['num_trips',num].values

    # Expected trips if the station followed this hour's share of `total`.
    weight = hourly_trips.sum()
    normalized = np.divide(station_totals*weight,total)

    # Relative deviation from the weighted expectation (not plotted here; pass
    # it to scatter() instead of diff_score_2 to view it).
    diff_score = np.divide(hourly_trips - normalized,normalized)

    # Relative deviation from the even 1/24 share (this is what is plotted).
    diff_score_2 = np.divide(hourly_trips - average,average)

    ax[num].scatter(range(len(sorted_num_trips_df.index)),diff_score_2,s=1)
    ax[num].get_xaxis().set_visible(False)
    ax[num].tick_params(labelsize=8)
    
    

<IPython.core.display.Javascript object>

In [92]:
# ---------------------------------------------------------------------------
# Per-station, per-hour deviation from the expected number of trips
# ---------------------------------------------------------------------------
# sorted_stations_df and sorted_num_trips_df are sorted by different totals, so
# their rows are not guaranteed to be in the same station order.  Look the
# totals up by station id so each row is compared with its OWN station's total
# (stations missing from sorted_stations_df become NaN instead of shifting rows).
station_totals = (
    sorted_stations_df
    .set_index('start station id')['total_trips']
    .reindex(sorted_num_trips_df.index)
    .values
)
# Loop-invariant: each station's total spread evenly over 24 hours.
average = np.divide(station_totals,24)

for num in range(24):
    hourly_trips = sorted_num_trips_df['num_trips',num].values

    # Expected trips if the station followed this hour's share of `total`.
    weight = hourly_trips.sum()
    normalized = np.divide(station_totals*weight,total)

    # diff:         absolute deviation from the weighted expectation
    # diff_score:   deviation relative to the weighted expectation
    # diff_score_2: deviation relative to an even 1/24 share of the total
    sorted_num_trips_df['diff',num] = hourly_trips - normalized
    sorted_num_trips_df['diff_score',num] = np.divide(hourly_trips - normalized,normalized)
    sorted_num_trips_df['diff_score_2',num] = np.divide(hourly_trips - average,average)

# Largest value of each measure per station, and the hour at which it occurs.
sorted_num_trips_df['max_diff'] = sorted_num_trips_df['diff'].max(axis=1)
sorted_num_trips_df['max_diff_time'] = sorted_num_trips_df['diff'].idxmax(axis=1)

sorted_num_trips_df['max_trips_time'] = sorted_num_trips_df['num_trips'].idxmax(axis=1)

sorted_num_trips_df['max_diff_score'] = sorted_num_trips_df['diff_score'].max(axis=1)
sorted_num_trips_df['max_diff_score_time'] = sorted_num_trips_df['diff_score'].idxmax(axis=1)

sorted_num_trips_df['max_diff_score_2'] = sorted_num_trips_df['diff_score_2'].max(axis=1)
sorted_num_trips_df['max_diff_score_2_time'] = sorted_num_trips_df['diff_score_2'].idxmax(axis=1)

# ---------------------------------------------------------------------------
# Mean diff_score over each group of hours
# ---------------------------------------------------------------------------
# The hour groups match the boundaries used for `Type` below.  Hour 6 used to be
# left out of the commuter sum while the divisor still counted 9 hours, which
# understated commuter_score; every group now averages exactly its own hours.
LATE_NIGHT_HOURS = [0,1,2,3,4,20,21,22,23]
COMMUTER_HOURS = [5,6,7,8,9,16,17,18,19]
MIDDAY_HOURS = [10,11,12,13,14,15]

hourly_diff_scores = sorted_num_trips_df['diff_score']
for score_name, score_hours in [('late_night_score', LATE_NIGHT_HOURS),
                                ('commuter_score', COMMUTER_HOURS),
                                ('midday_score', MIDDAY_HOURS)]:
    # skipna=False keeps the NaN propagation of a plain running sum.
    sorted_num_trips_df[score_name] = hourly_diff_scores[score_hours].mean(axis=1, skipna=False)


pd.set_option('display.max_columns', None)


# Type: classify each station by the hour of its largest diff_score.
# np.select takes the first matching condition, so the cut-offs act as
# successive upper bounds: 0-4 late night, 5-9 commuter, 10-15 midday,
# 16-19 commuter, 20-23 late night.
peak_hour = sorted_num_trips_df['max_diff_score_time']
sorted_num_trips_df = sorted_num_trips_df.assign(Type =
    np.select(
        condlist=[peak_hour < 5, peak_hour < 10, peak_hour < 16, peak_hour < 20, peak_hour < 24],
        choicelist=['Late Night','Commuter','Midday','Commuter','Late Night'],
        default='Other'))

# Type2: classify each station by whichever group score is largest.
type_scores=sorted_num_trips_df[['late_night_score', 'commuter_score', 'midday_score']]

# With multi-level columns idxmax returns (name, '') tuples.  Reduce each label
# to its first level and map it to a display name, rather than comparing a
# Series against a tuple (which may be interpreted as a list-like operand).
SCORE_LABELS = {'late_night_score': 'Late Night',
                'commuter_score': 'Commuter',
                'midday_score': 'Midday'}
best_score_name = type_scores.idxmax(axis=1).map(
    lambda label: label[0] if isinstance(label, tuple) else label)

sorted_num_trips_df = sorted_num_trips_df.assign(
    Type2 = best_score_name.map(SCORE_LABELS).fillna('Other'))

sorted_num_trips_df[['num_trips','Type','Type2']]

Unnamed: 0_level_0,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,num_trips,Type,Type2
pickup_hour,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,Unnamed: 25_level_1,Unnamed: 26_level_1
start station id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2
519,74.0,45.0,8.0,6.0,4.0,284.0,692.0,1839.0,2760.0,2303.0,665.0,417.0,452.0,465.0,566.0,822.0,1631.0,3937.0,3945.0,2106.0,856.0,413.0,250.0,120.0,Commuter,Commuter
3255,38.0,31.0,10.0,6.0,65.0,194.0,2935.0,2823.0,2169.0,1140.0,434.0,245.0,266.0,268.0,281.0,277.0,353.0,607.0,961.0,521.0,269.0,212.0,182.0,84.0,Commuter,Commuter
402,34.0,18.0,7.0,9.0,7.0,23.0,132.0,332.0,616.0,590.0,460.0,485.0,642.0,651.0,721.0,746.0,992.0,2253.0,2787.0,1281.0,458.0,235.0,160.0,67.0,Commuter,Midday
435,45.0,12.0,2.0,4.0,14.0,48.0,118.0,361.0,843.0,861.0,489.0,595.0,728.0,895.0,922.0,847.0,966.0,1542.0,1723.0,930.0,545.0,366.0,136.0,91.0,Midday,Midday
490,68.0,23.0,19.0,20.0,94.0,332.0,773.0,1017.0,1547.0,1065.0,334.0,282.0,368.0,344.0,381.0,512.0,754.0,1355.0,1376.0,694.0,452.0,416.0,356.0,148.0,Late Night,Late Night
477,128.0,100.0,29.0,33.0,21.0,1040.0,1494.0,427.0,684.0,822.0,374.0,251.0,238.0,358.0,365.0,534.0,458.0,1158.0,1604.0,880.0,540.0,325.0,382.0,205.0,Commuter,Commuter
497,36.0,12.0,4.0,6.0,11.0,55.0,111.0,282.0,606.0,727.0,626.0,507.0,734.0,805.0,800.0,796.0,1032.0,1635.0,1539.0,914.0,562.0,325.0,182.0,97.0,Midday,Midday
359,16.0,15.0,4.0,3.0,2.0,72.0,444.0,808.0,1095.0,568.0,238.0,208.0,269.0,286.0,318.0,396.0,1165.0,3242.0,2181.0,567.0,183.0,84.0,45.0,33.0,Commuter,Commuter
523,60.0,44.0,44.0,9.0,32.0,206.0,635.0,612.0,1063.0,1553.0,500.0,347.0,427.0,404.0,448.0,426.0,602.0,1345.0,1581.0,738.0,474.0,303.0,197.0,87.0,Late Night,Late Night
379,28.0,33.0,18.0,7.0,122.0,197.0,800.0,913.0,954.0,850.0,308.0,249.0,305.0,307.0,401.0,423.0,503.0,1215.0,1348.0,743.0,402.0,330.0,236.0,118.0,Late Night,Late Night
