In [None]:
############################################
# 
# Marcus Bischof
# Divvy EDA : Chicago
#
############################################

# Operations
import pandas as pd
import numpy as np

# Image libs
from PIL import Image, ImageChops
from folium.raster_layers import ImageOverlay

# Custom functions
from functions_for_eda import *

# Data viz
from matplotlib import pyplot as plt
import seaborn as sns

# Maps
import folium
from folium import plugins

# Jupyter display: show up to 500 rows / 500 columns before pandas truncates output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# One-off data preparation switches.
# True -> load the raw .csv and create a single memory efficient .pkl.
CREATE_SMALL_MEMORY_SET = False
# True -> break up the 860+mb memory efficient .pkl into 10 slices.
CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL = False

if CREATE_SMALL_MEMORY_SET:
    create_memory_efficient_pkl()
if CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL:
    create_slices_of_memory_efficient_pkl()

# Exploration inputs:
#   df       - one tenth of the divvy data (a single interim slice)
#   n_hood   - neighborhoods and geo, from the geojson loader
#   stations - processed station table
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
df = pd.read_pickle('../data/interim/df_0_1000000.pkl')
n_hood = load_geojson_neighborhood_data()
stations = pd.read_pickle('../data/processed/stations.pkl')

# Divvy Data Description on Kaggle's Site
------------

-  __trip_id__ ID attached to each trip taken
-  __year__ year
-  __month__ month
-  __week__ week No.
-  __day__
-  __hour__
-  __usertype__ 
    -  _Customer_ is a rider who purchased a 24-Hour Pass
    -  _Subscriber_ is a rider who purchased an Annual Membership
-  __gender__
-  __starttime__ day and time trip started, in CST
-  __stoptime__ day and time trip ended, in CST
-  __tripduration__ time of trip in minutes
-  __temperature__
-  __events__
-  __from_station_id__ ID of station where trip originated
-  __from_station_name__ name of station where trip originated
-  __latitude_start__ station latitude
-  __longitude_start__ station longitude
-  __dpcapacity_start__ number of total docks at each station
-  __to_station_id__
-  __to_station_name__
-  __latitude_end__
-  __longitude_end__
-  __dpcapacity_end__ number of total docks at each station

# Custom Functions that I wrote for this Analysis
------------

-  __To create a map.__
```
create_chicago_map()
```

-  __To add points to a map.__ \*\*Note: icon must be a font-awesome icon.
```
add_points_to_map
(
        folium_map_obj, color, icon, points
)
```

-  __To add a neighborhood overlay.__
```
add_neighborhood_overlay_to_map
(
        folium_map_obj, neighborhood_name, color, n_hood_df_polylines
)
```

In [None]:
# Base map object; the station heatmap and the top-neighborhood overlays are added to it in later cells.
m = create_chicago_map()

In [None]:
# Heatmap recipe thanks to https://alysivji.github.io/getting-started-with-folium.html
# One [lat, long] row per station.
stations_starts = stations.loc[:, ['lat', 'long']].values

# Layer the station heatmap onto the shared map `m` (mutated in place), then render it.
station_heatmap = plugins.HeatMap(stations_starts, radius=10)
m.add_child(station_heatmap)
m

We see a significant concentration of <i><b>divvy stations</b></i> in:
    -  The loop
    -  Northern neighborhoods on the lake like Lincoln Park 
    
For our analysis, let's first understand the data broadly. 

We will <i><b>then</b></i> start with a neighborhood centric approach to analyzing the data. I believe that since neighborhoods contain residents that may share certain commonalities, we may see interesting trends <i><b>among</b></i> and <i><b>between</b></i> various neighborhoods.

In [None]:
# Peek at the first rows of the trip sample to see which columns are available.
df.head()

In [None]:
# Violin plots of trip duration for each month, split by user type.
g = sns.catplot(
    data=df,
    kind="violin",
    x="month",
    y="tripduration",
    hue="usertype",
)

In [None]:
# Mean trip duration per (month, usertype) pair; pairs whose mean is NaN are dropped.
(
    df.groupby(['month', 'usertype'])[['tripduration']]
    .mean()
    .dropna(how='any')
)

# Trip Duration Analysis Below

It seems as though we are only getting customer data from <i><b>July</b></i>.

How much does trip duration vary across event types?

In [None]:
# Violin plots of trip duration for each value of the `events` column.
g = sns.catplot(
    data=df,
    kind="violin",
    x="events",
    y="tripduration",
)

In [None]:
# NOTE(review): disabled experiment - one events/tripduration violin panel per
# from_neighborhood. Nothing below depends on it; restore or delete before sharing.
#g = sns.catplot(
#    x="events", y="tripduration", col="from_neighborhood",
#    data=df, kind="violin"
#)
#plt.xticks(rotation=90)

In [None]:
# Trip count and mean trip duration per origin neighborhood, longest average first.
# Columns end up as a MultiIndex: from_neighborhood, (tripduration, count), (tripduration, mean).
trip_durations_by_hood = (
    df[['from_neighborhood', 'tripduration']]
    .groupby(['from_neighborhood'])
    .agg(['count', 'mean'])
    .sort_values([('tripduration', 'mean')], ascending=False)
    .reset_index()
)

In [None]:
# The frame is sorted by mean duration (descending), so these are the five longest-average neighborhoods.
trip_durations_by_hood.head()

Let's add the top 5 neighborhoods (by average tripduration) to the map. Anything in common here?

In [None]:
# Outline the five neighborhoods with the longest average trips in red on the shared map `m`.
# NOTE(review): the names are hard-coded, presumably copied from the head() output above -
# re-check them if the underlying data slice changes.
top_hoods = ('Edgewater', 'Museum Campus', 'Little Village', 'Douglas', 'Gold Coast')
for top_hood in top_hoods:
    add_neighborhood_overlay_to_map(m, top_hood, 'red', n_hood)
m

I am certainly getting the impression that the top neighborhoods in terms of average trip duration are actually neighborhoods with a small number of stations, which makes sense.

We will do the following:
    -  The top neighborhoods (1 --> 13): yellow
    -  The middle neighborhoods (14 --> 26): green
    -  The bottom neighborhoods (27 --> 39): blue

In [None]:
# Colour each neighborhood by the tercile of its average trip duration:
#   longest third -> yellow, middle third -> green, shortest third -> blue.
# qcut on ranks gives three equal-sized groups (pd.cut would give equal-width
# value bins instead). Labels are listed from the lowest bin upward, so "blue"
# comes first. rank(method='first') breaks ties so the bin edges stay unique.
mean_trip_duration = trip_durations_by_hood['tripduration']['mean']
trip_durations_by_hood['color'] = pd.qcut(
    mean_trip_duration.rank(method='first'),
    3,
    labels=["blue", "green", "yellow"],
)

In [None]:
# Fresh map for the duration-coloured neighborhood overlays.
neighborhood_map = create_chicago_map()

# itertuples() yields positional rows: (index, from_neighborhood, count, mean, color),
# so [1] is the neighborhood name and [4] is its assigned colour.
for hood_row in trip_durations_by_hood.itertuples():
    add_neighborhood_overlay_to_map(neighborhood_map, hood_row[1], hood_row[4], n_hood)
    
def trim(img):
    """Crop the border that matches the top-left pixel and return the image as an array.

    A solid image filled with the colour of pixel (0, 0) is subtracted from
    ``img``; the bounding box of what remains is used as the crop region.
    If no bounding box is found, the image is returned uncropped.

    Parameters
    ----------
    img : PIL image to trim.

    Returns
    -------
    numpy.ndarray built from the (possibly cropped) image.
    """
    background = Image.new(img.mode, img.size, img.getpixel((0, 0)))
    delta = ImageChops.difference(img, background)
    # Scale/offset the difference (scale 2.0, offset -100) before taking the bounding box.
    delta = ImageChops.add(delta, delta, 2.0, -100)
    content_box = delta.getbbox()
    cropped = img.crop(content_box) if content_box else img
    return np.array(cropped)

# Load the legend picture and trim its border while the file is still open.
with Image.open('avg_trip_duration.png') as legend_file:
    image = trim(legend_file)

# We add our legend as an image, pinned to a fixed pair of [lat, long] corners.
legend_bounds = [[41.954883, -87.594551], [41.894883, -87.494551]]
legend_overlay = ImageOverlay(image=image, bounds=legend_bounds, zindex=1)
legend_overlay.add_to(neighborhood_map)

neighborhood_map

In [None]:
# Bar chart of the mean trip duration for each week number.
weekly_mean_duration = df.groupby('week')[['tripduration']].mean()
weekly_mean_duration.plot.bar(title="Avg. Trip Duration per Week")

Confirm below that week one corresponds to the first week of January, as expected. The <font color='green'>chart above</font> makes sense now, as we would expect the average duration of trips to be shorter in the winter and longer in the summer.

In [None]:
# Sanity check: show the first few trips whose week number is 1.
df.loc[df['week'] == 1].head()

# Station Capacity Analysis Below

We want to understand how capacity can potentially be analyzed and predicted. 

Let's first understand how neighborhoods differ when it comes to the percentage of trips that end at a different neighborhood vs. trips that end in the same neighborhood. Likewise, let's examine the same statistic for station to station trips.

In [None]:
# For each origin neighborhood, the fraction of its trips that end in a
# different neighborhood, as a list of (neighborhood, ratio) tuples sorted by
# ascending ratio.
#
# Computed in one vectorized pass instead of re-filtering the whole frame twice
# per neighborhood. Rows with a missing from_neighborhood are skipped by
# groupby, which avoids dividing by an empty group.
# astype(object) keeps the element-wise comparison valid even if the two
# columns are categoricals with different category sets.
origin_hood = df.from_neighborhood.astype(object)
leaves_origin_hood = origin_hood != df.to_neighborhood.astype(object)
n_hood_different_neighborhood_ratios = sorted(
    leaves_origin_hood.groupby(origin_hood).mean().items(),
    key=lambda tup: tup[1],
)

In [None]:
# New map: every neighborhood drawn in yellow, with its different-neighborhood
# trip ratio passed as the last argument (presumably the fill strength - verify
# against add_neighborhood_overlay_to_map_with_fill).
neighborhood_diff_trip_end_density = create_chicago_map()
for hood_name, outbound_share in n_hood_different_neighborhood_ratios:
    add_neighborhood_overlay_to_map_with_fill(
        neighborhood_diff_trip_end_density, hood_name, 'yellow', n_hood, outbound_share
    )

In [None]:
# Render the map of per-neighborhood different-neighborhood trip ratios.
neighborhood_diff_trip_end_density

-  We do notice that smaller neighborhoods tend to have more trips that end outside of them. This makes perfect sense: riders are interested in going places, and the likelihood that a trip ends outside of a neighborhood boundary should probably increase the smaller a neighborhood is (or the closer a station is to a neighborhood edge, for that matter).


-  Furthermore, we notice that neighborhoods on the outer edge (i.e edge divvy stations) tend to have a higher density, which may indicate that trips tend to gravitate towards the center. This makes sense to me for two reasons:
      -  Out of sheer necessity, if there is not a divvy station further from an edge, it is more likely that bikers will go in directions (central) that have divvy stations.
      -  Assuming a significant portion of traffic involves people going to work or going to activities (bars, skating rinks, you name it...), there is a higher concentration of workplaces and activities the closer you are to the transportation hubs (i.e. OTC, Union Station, CTA stops...)

In [None]:
# Another look at the first trip rows (including the hour column) before the time-of-day check below.
df.head()

In [None]:
# Let's validate that many trips take place between typical work hours as hypothesized above.
# Rows per start hour, counted via the non-null `week` values.
trips_per_hour = (
    df.groupby('hour')['week']
    .count()
    .reset_index(name='count')
)
trips_per_hour.plot.bar(x='hour', y='count', title='Amount of trips that start during this hour (military hours)')

The bimodal distribution above, with peaks at <font color='green'>__8am__</font> and <font color='green'>__5pm__</font>, suggests that my hypothesis regarding work trips is at least partially correct.

Let's examine capacity as it relates to various stations.

In [None]:
def _year_month(timestamp):
    """Concatenate the first two '-'-separated fields of the date part of ``timestamp``.

    The date part is everything before the first space.
    Assumes starttime strings look like 'YYYY-MM-DD hh:mm...' - TODO confirm.
    """
    date_fields = timestamp.split(' ')[0].split('-')
    # Append str(date_fields[2]) as well for a per-day key instead of per-month.
    return str(date_fields[0]) + str(date_fields[1])


# Year+month key for every trip, used to bucket trips over time.
df['ym'] = df.starttime.apply(_year_month)

In [None]:

# One line chart per Wicker Park station: rows per year-month, split by the
# same_neighborhood_trip flag. (Ashland Ave & Division St is the station near
# and dear to my heart.)
wicker_park_stations = df.loc[df.from_neighborhood == 'Wicker Park', 'from_station_name'].unique()
for station in wicker_park_stations:
    station_rows = df.from_station_name == station
    # count() tallies non-null dpcapacity_start values, used here as the trip count.
    trips_from = (
        df.loc[station_rows, ['dpcapacity_start', 'ym', 'same_neighborhood_trip']]
        .groupby(['ym', 'same_neighborhood_trip'])
        .count()
        .sort_values(by=['ym'], ascending=True)
        .reset_index()
    )
    trips_from.columns = ['year_month', 'same_hood', 'count']
    ax = sns.lineplot(data=trips_from, x="year_month", y="count", hue="same_hood")
    ax.set_title(station)
    plt.show()

In [None]:
# One line chart per origin neighborhood: rows per year-month across all of its
# stations, split by the same_neighborhood_trip flag.
for n in df.from_neighborhood.unique():
    hood_rows = df.from_neighborhood == n
    # count() tallies non-null dpcapacity_start values, used here as the trip count.
    trips_from = (
        df.loc[hood_rows, ['dpcapacity_start', 'ym', 'same_neighborhood_trip']]
        .groupby(['ym', 'same_neighborhood_trip'])
        .count()
        .sort_values(by=['ym'], ascending=True)
        .reset_index()
    )
    trips_from.columns = ['year_month', 'same_hood', 'count']
    ax = sns.lineplot(data=trips_from, x="year_month", y="count", hue="same_hood")
    ax.set_title(n)
    plt.show()

-  <i>What stands out to me the most above</i>
    -  There is a <b>MASSIVE</b> difference among the various neighborhoods in terms of sheer trip volume
        -  E.g. The Loop and Lincoln Park are totally different beasts than Kenwood and Wicker Park
    -  Some neighborhoods have near parity between same-neighborhood trips and different-neighborhood trips
    -  Other neighborhoods have different-neighborhood trips dominating same-neighborhood trips

In [None]:
# Box plots of trip duration by the same_neighborhood_trip flag, one panel per gender.
g = sns.catplot(
    data=df,
    kind="box",
    x="same_neighborhood_trip",
    y="tripduration",
    col="gender",
)

Girls are representing and biking longer! (The 75th percentile of their trip duration is higher.)

How far off are capacities of from and to trips?

In [None]:
# Cast both dock-capacity columns to float so they can be subtracted.
for capacity_col in ('dpcapacity_start', 'dpcapacity_end'):
    df[capacity_col] = df[capacity_col].astype('float')

# Positive: the start station has more docks than the end station; negative: fewer.
df['capacity_diff'] = df['dpcapacity_start'].sub(df['dpcapacity_end'])

In [None]:
# Mean start-minus-end dock capacity difference for trips leaving each neighborhood.
inflow_outflow_df = (
    df.groupby('from_neighborhood')['capacity_diff']
    .mean()
    .reset_index()
)

# Extremes of those per-neighborhood means, used to normalise the map shading.
inflow_min = inflow_outflow_df['capacity_diff'].min()
outflow_max = inflow_outflow_df['capacity_diff'].max()

In [None]:
# Map of the mean capacity difference: negative neighborhoods in red, the rest
# in green. Each value is scaled by the most extreme mean on its side
# (inflow_min for negatives, outflow_max otherwise).
inflow_outflow_map = create_chicago_map()
for n, density in inflow_outflow_df.itertuples(index=False):
    is_negative = density < 0
    c = 'red' if is_negative else 'green'
    d = abs(density) / abs(inflow_min) if is_negative else density / outflow_max
    add_neighborhood_overlay_to_map_with_fill(inflow_outflow_map, n, c, n_hood, d)

In [None]:
# Render the red/green capacity-difference map.
inflow_outflow_map