In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import folium
import branca.colormap as cm

warnings.filterwarnings('ignore')

In [2]:
# Load the four preprocessed 2017 quarterly trip tables (feather files under
# data/) and stack them into one frame with a fresh 0..n-1 index.
quarterly_files = (
    'data/Divvy_data_2017_Q1.feather',
    'data/Divvy_data_2017_Q2.feather',
    'data/Divvy_data_2017_Q3.feather',
    'data/Divvy_data_2017_Q4.feather',
)
# The per-quarter frames stay bound to data1..data4, in file order.
data1, data2, data3, data4 = (pd.read_feather(path) for path in quarterly_files)

quarterly_frames = [data1, data2, data3, data4]
data = pd.concat(quarterly_frames, ignore_index=True)

In [3]:
# Fraction of trips whose bike is returned to the station it was rented from.
# The mean of the boolean mask is the share of True rows. Series.mean() is
# vectorised, whereas the builtin sum() walks the Series one element at a time
# in Python, which is slow on a full year of trips.
# Note: on an empty frame this yields NaN rather than raising ZeroDivisionError.
returned_to_origin = data['from_station_id'] == data['to_station_id']
same_station_ratio = returned_to_origin.mean()
print(f'Percentage of bike return to the same station is {same_station_ratio*100: .2f}% ')

Percentage of bike return to the same station is  3.17% 


In [4]:
def _count_trips_by_start_station(trips):
    """Count trips per start-station coordinate pair.

    Parameters
    ----------
    trips : pandas.DataFrame
        Trip records containing the columns 'from_longitude',
        'from_latitude' and 'tripduration'.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ('from_longitude', 'from_latitude') pair. The
        'tripduration' column holds the number of non-null durations in that
        group, i.e. it is a trip count, not a duration.
    """
    return (
        trips[['from_longitude', 'from_latitude', 'tripduration']]
        .groupby(['from_longitude', 'from_latitude'])
        .count()
        .reset_index()
    )


# Split on 'dayofweek': values below 5 form the weekday set, values above 4
# the weekend set.
# NOTE(review): presumably Monday=0 ... Sunday=6 (pandas convention) - confirm
# against the preprocessing step that produced this column.
station_start_wd = _count_trips_by_start_station(data[data['dayofweek'] < 5])
station_start_we = _count_trips_by_start_station(data[data['dayofweek'] > 4])

In [5]:
## Stationwise plot
# Plot the most frequently used start stations on weekdays and on weekends as
# two toggleable layers of one folium map.
top_cutoff = 100

# Marker radius = trip count / radius_divisor + radius_offset. The offset keeps
# low-count stations visible. The same scaling is applied to both layers.
radius_divisor = 4200
radius_offset = 3


def _top_start_stations(station_counts, cutoff, divisor, offset):
    """Rank stations by trip count and derive marker inputs for the busiest ones.

    Parameters
    ----------
    station_counts : pandas.DataFrame
        Columns 'from_longitude', 'from_latitude' and 'tripduration', where
        'tripduration' is the per-station count used for ranking and sizing.
    cutoff : int
        Number of top-ranked stations to keep.
    divisor, offset : number
        Radius scaling: radius = count / divisor + offset.

    Returns
    -------
    tuple
        (ranked frame, longitudes, latitudes, radii); the last three are
        Series restricted to the first `cutoff` rows of the ranked frame.
    """
    ranked = station_counts.sort_values(by="tripduration", ascending=False)
    busiest = ranked.head(cutoff)
    radii = busiest['tripduration'] / divisor + offset
    return ranked, busiest['from_longitude'], busiest['from_latitude'], radii


def _add_station_markers(group, lats, lons, radii, color, fill_opacity):
    """Add one filled CircleMarker per (lat, lon, radius) triple to `group`."""
    for lat, lon, radius in zip(lats, lons, radii):
        folium.CircleMarker(
            [lat, lon],
            color=color,
            fill=True,
            radius=radius,
            weight=2,
            fill_opacity=fill_opacity,
        ).add_to(group)


sorted_st_start_wd, long_lst_wd, lat_lst_wd, size_lst_wd = _top_start_stations(
    station_start_wd, top_cutoff, radius_divisor, radius_offset
)
sorted_st_start_we, long_lst_we, lat_lst_we, size_lst_we = _top_start_stations(
    station_start_we, top_cutoff, radius_divisor, radius_offset
)

map_from = folium.Map(location=[41.90, -87.64], zoom_start=12)
wd_group = folium.FeatureGroup(name="Weekdays")
we_group = folium.FeatureGroup(name="Weekends")

# Weekdays in red, weekends in blue. The weekend layer is drawn more opaque.
_add_station_markers(wd_group, lat_lst_wd, long_lst_wd, size_lst_wd,
                     color="red", fill_opacity=0.2)
_add_station_markers(we_group, lat_lst_we, long_lst_we, size_lst_we,
                     color="blue", fill_opacity=0.4)

wd_group.add_to(map_from)
we_group.add_to(map_from)

# LayerControl is added after the groups are attached to the map.
folium.LayerControl().add_to(map_from)
map_from
#map_from.save('from_station.html')

### Each marker represents a Divvy bike station, placed at its GPS coordinates. The size of each marker reflects the number of trips that started at that station.


### Bike usage differs markedly between weekdays and weekends. On weekdays, demand for bikes is highest in the Loop area of Chicago, where many large companies are located. On weekends, people tend to rent bikes closer to Lake Michigan, where they usually spend time with friends and family.

### This EDA indicates that bike usage is correlated with both station location and time period. Therefore, geo-level and time-level features should be taken into account, for example the daily traffic around a Divvy station or the crime rate around the station over the past few days.