In [None]:
############################################
# 
# Marcus Bischof
# Divvy EDA : Chicago
#
############################################

# Operations
import pandas as pd
import numpy as np

# Custom functions
from functions_for_eda import *

# Data viz
from matplotlib import pyplot as plt
import seaborn as sns

# Maps
import folium
from folium import plugins

# Jupyter display: raise the row and column limits so large frames are not truncated
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# One-off preprocessing switches; both stay False for a normal analysis run.
#   CREATE_SMALL_MEMORY_SET: load the raw .csv and create a single memory efficient .pkl
#   CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL: break up the 860+mb memory efficient .pkl into 10 slices
CREATE_SMALL_MEMORY_SET = False
CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL = False

if CREATE_SMALL_MEMORY_SET:
    create_memory_efficient_pkl()
if CREATE_SLICES_OF_MEMORY_EFFICIENT_PKL:
    create_slices_of_memory_efficient_pkl()

# Inputs shared by the rest of the notebook:
#   df       - trip records (presumably one of the slices produced above - TODO confirm)
#   n_hood   - neighborhood data passed to add_neighborhood_overlay_to_map
#   stations - station table; its 'lat'/'long' columns feed the heatmap
# NOTE(review): read_pickle executes arbitrary code from the file; only load trusted pickles.
df = pd.read_pickle('../data/interim/df_0_1000000.pkl')
n_hood = load_geojson_neighborhood_data()
stations = pd.read_pickle('../data/processed/stations.pkl')

-  __To create a map.__
```
create_chicago_map()
```

-  __To add points to a map.__ **Note:** `icon` must be a Font Awesome icon name.
```
add_points_to_map(folium_map_obj, color, icon, points)
```

-  __To add a neighborhood overlay.__
```
add_neighborhood_overlay_to_map(folium_map_obj, neighborhood_name, color, n_hood_df_polylines)
```

In [None]:
# Base map from create_chicago_map(); later cells add the station heatmap and the
# red top-5 neighborhood overlays to this same object, so re-running those cells
# without re-running this one stacks duplicate layers.
m = create_chicago_map()

In [None]:
# Thanks to https://alysivji.github.io/getting-started-with-folium.html
# One [lat, long] row per station, as a plain array for the heatmap plugin
stations_starts = stations.loc[:, ['lat', 'long']].values

# Build the heatmap layer, attach it to the shared map `m`, then render the map
station_heat_layer = plugins.HeatMap(data=stations_starts, radius=10)
m.add_child(station_heat_layer)
m

We see a significant concentration of <i><b>Divvy stations</b></i> in:

-  The Loop
-  Northern lakefront neighborhoods such as Lincoln Park
    
For our analysis, let's first understand the data broadly. 

We will <i><b>then</b></i> take a neighborhood-centric approach to analyzing the data. Since neighborhoods contain residents who may share certain commonalities, we may see interesting trends <i><b>among</b></i> and <i><b>between</b></i> various neighborhoods.

In [None]:
# Preview the first rows of the trip frame to see which columns are available
df.head()

In [None]:
# Violin plot of trip duration for each month, with one violin per user type
g = sns.catplot(data=df, kind="violin", x="month", y="tripduration", hue="usertype")

In [None]:
# Mean trip duration for every (month, usertype) pair; pairs whose mean is NaN are dropped
(
    df.groupby(['month', 'usertype'])[['tripduration']]
    .mean()
    .dropna(how='any')
)

It seems as though we are only getting customer data from <i><b>July</b></i>.

How much does trip duration vary across event types?

In [None]:
# Violin plot of trip duration for each value of the `events` column
g = sns.catplot(data=df, kind="violin", x="events", y="tripduration")

In [None]:
# Violin plot of trip duration for each origin neighborhood
g = sns.catplot(
    x="from_neighborhood", y="tripduration",
    data=df, kind="violin"
)

# Rotate the neighborhood labels so they do not overlap. This goes through the
# grid's own axes rather than plt.xticks(), so it does not depend on pyplot's
# "current axes" and the cell no longer echoes the (ticks, labels) return value.
for ax in g.axes.flat:
    ax.tick_params(axis='x', labelrotation=90)

In [None]:
# Trip count and mean trip duration per origin neighborhood, sorted so the
# neighborhoods with the longest average trips come first. reset_index() turns
# the neighborhood name back into the first column.
trip_durations_by_hood = (
    df.groupby(['from_neighborhood'])[['tripduration']]
    .agg(['count', 'mean'])
    .sort_values([('tripduration', 'mean')], ascending=False)
    .reset_index()
)

In [None]:
# First rows of the table, i.e. the neighborhoods with the highest mean trip duration
trip_durations_by_hood.head()

Let's add the top 5 neighborhoods (by average tripduration) to the map. Anything in common here?

In [None]:
# Outline in red the neighborhoods with the longest average trips.
# The names are read from trip_durations_by_hood (sorted by mean trip duration,
# descending) instead of being typed in by hand, so the overlay always matches
# the table even when a different data slice is loaded.
TOP_N_NEIGHBORHOODS = 5
for top_hood in trip_durations_by_hood['from_neighborhood'].head(TOP_N_NEIGHBORHOODS):
    add_neighborhood_overlay_to_map(m, top_hood, 'red', n_hood)
m

I am certainly getting the impression that the top neighborhoods in terms of average trip duration are actually neighborhoods with a small number of stations. This makes sense.

We will color every neighborhood by its rank in average trip duration:

-  The top neighborhoods (1 --> 13): yellow
-  The middle neighborhoods (14 --> 26): green
-  The bottom neighborhoods (27 --> 39): blue

In [None]:
# Color every neighborhood by its rank in trip_durations_by_hood (sorted by mean
# trip duration, longest first): ranks 1-13 yellow, 14-26 green, the rest blue.
TOP_RANK_CUTOFF = 13
MIDDLE_RANK_CUTOFF = 26

neighborhood_map = create_chicago_map()
# enumerate(..., start=1) supplies the 1-based rank, replacing a hand-maintained counter
for rank, neighborhood in enumerate(trip_durations_by_hood.itertuples(), start=1):
    if rank <= TOP_RANK_CUTOFF:
        color = 'yellow'
    elif rank <= MIDDLE_RANK_CUTOFF:
        color = 'green'
    else:
        color = 'blue'
    # itertuples() places the index at position 0, so position 1 is the first
    # column of the table, which holds the neighborhood name.
    add_neighborhood_overlay_to_map(neighborhood_map, neighborhood[1], color, n_hood)
neighborhood_map