# Bikeshare rides
This notebook analyzes bikeshare data for Chicago Divvy bikeshare. The goal is to understand in what capacity people use bikeshare for daily activities, i.e. grocery shopping, commuting, etc. 

In [1]:
import json
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd

In [23]:
# Build the Chicago station info df (with lat, long, name)
stations_file = open('./data/Chicago/stations.json','r')
stations = stations_file.read()
stations_file.close()

stations_json = json.loads(stations)
stations_json_list = stations_json['stationBeanList']


stations_info_df = pd.DataFrame(stations_json_list)
stations_info_df.drop(['city','is_renting', 'availableDocks','availableBikes','altitude','landMark','lastCommunicationTime','postalCode','location','stAddress2','status','statusValue','statusKey','testStation','stAddress1','kioskType'],axis = 1,inplace=True)   
stations_info_df.set_index('id',inplace=True)
stations_info_df

Unnamed: 0_level_0,latitude,longitude,stationName,totalDocks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,41.876470,-87.620340,Buckingham Fountain,39
3,41.867226,-87.615355,Shedd Aquarium,55
4,41.856268,-87.613348,Burnham Harbor,23
5,41.874053,-87.627716,State St & Harrison St,23
6,41.886976,-87.612813,Dusable Harbor,39
7,41.886349,-87.617517,Field Blvd & South Water St,19
9,41.828792,-87.680604,Leavitt St & Archer Ave,14
11,41.766638,-87.576450,Jeffery Blvd & 71st St,11
12,41.766409,-87.565688,South Shore Dr & 71st St,15
13,41.932418,-87.652705,Wilton Ave & Diversey Pkwy,35


In [3]:
# Read in Chicago 2018 Q4 data
df = pd.read_csv('./data/Chicago/Divvy_Trips_2018_Q4/Divvy_Trips_2018_Q4.csv')
df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,20983530,2018-10-01 00:01:17,2018-10-01 00:29:35,4551,1698.0,85,Michigan Ave & Oak St,166,Ashland Ave & Wrightwood Ave,Subscriber,Male,1992.0
1,20983531,2018-10-01 00:03:59,2018-10-01 00:10:55,847,416.0,13,Wilton Ave & Diversey Pkwy,144,Larrabee St & Webster Ave,Subscriber,Female,1982.0
2,20983532,2018-10-01 00:05:14,2018-10-01 00:14:08,6188,534.0,59,Wabash Ave & Roosevelt Rd,197,Michigan Ave & Madison St,Subscriber,Male,1986.0
3,20983533,2018-10-01 00:05:48,2018-10-01 00:18:46,6372,778.0,328,Ellis Ave & 58th St,419,Lake Park Ave & 53rd St,Subscriber,Female,1960.0
4,20983534,2018-10-01 00:07:29,2018-10-01 00:25:51,1927,1102.0,93,Sheffield Ave & Willow St,159,Claremont Ave & Hirsch St,Subscriber,Female,1993.0


In [4]:
subs_df = df[df['usertype'] == 'Subscriber'].copy()

In [5]:
subs_df.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,20983530,2018-10-01 00:01:17,2018-10-01 00:29:35,4551,1698.0,85,Michigan Ave & Oak St,166,Ashland Ave & Wrightwood Ave,Subscriber,Male,1992.0
1,20983531,2018-10-01 00:03:59,2018-10-01 00:10:55,847,416.0,13,Wilton Ave & Diversey Pkwy,144,Larrabee St & Webster Ave,Subscriber,Female,1982.0
2,20983532,2018-10-01 00:05:14,2018-10-01 00:14:08,6188,534.0,59,Wabash Ave & Roosevelt Rd,197,Michigan Ave & Madison St,Subscriber,Male,1986.0
3,20983533,2018-10-01 00:05:48,2018-10-01 00:18:46,6372,778.0,328,Ellis Ave & 58th St,419,Lake Park Ave & 53rd St,Subscriber,Female,1960.0
4,20983534,2018-10-01 00:07:29,2018-10-01 00:25:51,1927,1102.0,93,Sheffield Ave & Willow St,159,Claremont Ave & Hirsch St,Subscriber,Female,1993.0


In [7]:
subs_df['start-end station'] = tuple(zip(subs_df['from_station_id'], subs_df['to_station_id']))
# Other ways (much slower):
# df['start-end station']  = df[['start station id','end station id']].apply(list, axis=1)
# df['start-end station'].apply(sorted)

start_end = subs_df['start-end station']
subs_df['start-end station'] = start_end.map(lambda x: tuple(sorted(x)))

In [9]:
subs_df = subs_df.sort_values('start-end station')


In [12]:
subs_df_groupBy = subs_df.groupby('start-end station',axis=0)
trips_ser = subs_df_groupBy['start_time'].count()
top_5000_ser = trips_ser.sort_values(ascending = False)[0:5000]

In [14]:
#stations_info_df = df[['start station id','start station latitude', 'start station longitude','start station name']].set_index('start station id')

In [169]:
#stations_info_df = station_info_df.drop_duplicates()
#stations_info_df.head()

Unnamed: 0_level_0,start station latitude,start station longitude
start station id,Unnamed: 1_level_1,Unnamed: 2_level_1
72,40.767272,-73.993929
79,40.719116,-74.006667
82,40.711174,-74.000165
83,40.683826,-73.976323
119,40.696089,-73.978034


In [30]:
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd
from matplotlib import cm


def leaflet_plot_stations():

    station_locations_by_hash = df[df['hash'] == hashid]

    lons = station_locations_by_hash['LONGITUDE'].tolist()
    lats = station_locations_by_hash['LATITUDE'].tolist()

    plt.figure(figsize=(8,8))

    plt.scatter(lons, lats, c='r', alpha=0.7, s=200)

    return mplleaflet.display()

plt.figure(figsize=(8,16))

num_pairs = 200

cmap = cm.get_cmap('jet')

max_trips = top_5000_ser[0]

for item in top_5000_ser.index[:num_pairs]:
    
    startlat = stations_info_df.loc[item[0]]['latitude']
    endlat = stations_info_df.loc[item[1]]['latitude']
    lats = [startlat, endlat]
    
    startlon = stations_info_df.loc[item[0]]['longitude']
    endlon = stations_info_df.loc[item[1]]['longitude']
    lons = [startlon,endlon]
    
    
    #plt.scatter(lons, lats, c='r', alpha=0.7, s=10)
    plt.plot(lons, lats, color = cmap(top_5000_ser[item]/max_trips),linewidth = 1.5,alpha=.5) # Draw blue line
    
    #cmap(top_2000_ser[item]/max_trips * 0.2)

station_lats = stations_info_df['latitude']
station_lons = stations_info_df['longitude']
station_num_bikes = stations_info_df['totalDocks']

plt.scatter(station_lons, station_lats, color ='r', alpha = .5, s = station_num_bikes)

mplleaflet.display(tiles='cartodb_positron')
    
#plt.figure(figsize=(8,8))

#lats = [52.0907, 47.9990]
#lons = [5.1214, 7.8421]

#plt.scatter(lons, lats, c='g', alpha=0.7, s=50)
#plt.plot(lons, lats, 'b') # Draw blue line

#mplleaflet.display()

In [161]:
top_5000_ser.head(20)

start-end station
(460, 3093)     613
(3093, 460)     606
(519, 492)      532
(432, 3263)     530
(435, 509)      505
(258, 324)      475
(3430, 3086)    428
(481, 3093)     424
(3086, 3430)    413
(462, 453)      394
(519, 498)      388
(3351, 3318)    382
(494, 3258)     371
(3256, 426)     362
(426, 3256)     360
(3118, 3119)    358
(363, 3002)     348
(514, 426)      347
(458, 494)      346
(519, 491)      341
Name: starttime, dtype: int64

In [179]:
stations_info_df_test = df[['start station id','start station latitude', 'start station longitude','start station name']]
stations_info_df_test = stations_info_df_test.set_index('start station id')
stations_info_df_test=stations_info_df_test.drop_duplicates()
print(stations_info_df_test.loc[460]['start station name'])
print(stations_info_df_test.loc[3093]['start station name'])


S 4 St & Wythe Ave
N 6 St & Bedford Ave
