In [None]:
############################################
# 
# Marcus Bischof
# Divvy EDA : Chicago
#
############################################

# Operations
import pandas as pd
import numpy as np

# Data viz
from matplotlib import pyplot as plt
import seaborn as sns

# Maps
import folium

# Jupyter display: let pandas render up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

# Do we need to create and pickle a memory efficient version of the divvy data?
# When True, the raw CSV is read, its columns are cast to smaller dtypes, and the
# result is written to ../data/processed/divvy_data_small_memory.pkl.
CREATE_SMALL_MEMORY_SET = True

if CREATE_SMALL_MEMORY_SET:

    df = pd.read_csv('../data/raw/divvy_data.csv')

    # Objects with defined ranges (I do this to from_ and to_station because we are just dealing with Chicago)
    for object_to_cat_feature in ['gender', 'usertype', 'events', 'from_station_name', 'to_station_name']:
        df[object_to_cat_feature] = df[object_to_cat_feature].astype('category')

    # The numeric downcasts below were chosen from the column ranges reported by
    # df.describe().T[['min', 'max']] (run it interactively in its own cell to re-check).
    # NOTE(review): astype() does not validate ranges, so values outside the target dtype
    # (or non-integer floats) would be altered without an error - confirm the ranges for
    # 'tripduration' and 'temperature' in particular.

    # Ints with values ranging from 0 to 255 can be stored as uint8
    for small_int_feature in ['day', 'month', 'week', 'hour', 'tripduration', 'dpcapacity_start', 'dpcapacity_end']:
        df[small_int_feature] = df[small_int_feature].astype('uint8')

    # Ints with values ranging from 0 to 65535 can be stored as uint16
    # (index with this loop's own variable; reusing small_int_feature here would leave
    # 'year' and 'from_station_id' untouched and re-cast 'dpcapacity_end' instead).
    for med_int_feature in ['year', 'from_station_id']:
        df[med_int_feature] = df[med_int_feature].astype('uint16')

    # NOTE(review): float64 is kept for the coordinates, so this cast presumably saves no
    # memory; float32 would halve it at the cost of coordinate precision - confirm intent.
    for sm_float_val in ['latitude_start', 'longitude_start', 'latitude_end', 'longitude_end']:
        df[sm_float_val] = df[sm_float_val].astype('float64')

    # int8 -128 to 127
    df.temperature = df.temperature.astype('int8')

    df.to_pickle('../data/processed/divvy_data_small_memory.pkl')

# Do we want to load the more memory efficient pickle or not?
# True  -> read the processed pickle; False -> read the raw CSV.
LOAD_FROM_SMALLER_MEMORY = True

if LOAD_FROM_SMALLER_MEMORY:
    df = pd.read_pickle('../data/processed/divvy_data_small_memory.pkl')
else:
    df = pd.read_csv('../data/raw/divvy_data.csv')

# Summarise column dtypes and memory usage of whichever frame was loaded.
df.info()

In [None]:
# Create a DataFrame of unique stations, and their point (point == (latitude, longitude)),
# assuming from_station and to_station exhaust the same list of stations.
#
# The distinct (name, lat, long) rows are taken directly with drop_duplicates() instead of
# a groupby().count(): grouping on a categorical key without observed=True can expand the
# result to every combination of categories and coordinates, and needed a dummy count
# column that was thrown away afterwards.
station_columns = ['from_station_name', 'latitude_start', 'longitude_start']
stations = (
    df[station_columns]
    .dropna(how='any')            # groupby ignored rows with a missing key; keep that behaviour
    .drop_duplicates()
    .sort_values(station_columns)  # same ordering the groupby result had
    .reset_index(drop=True)
)
stations.columns = ['station', 'lat', 'long']

In [None]:
# Wicker Park is a nicely defined square.
# Using a Google Maps overlay, combined with latitudes and longitudes taken from
# https://www.latlong.net/, we can define Wicker Park's (approximate) boundaries.

# More precise corner readings, kept for reference:
#wicker_tr = (41.914238, -87.667908)
#wicker_tl = (41.913950, -87.687312)
#wicker_bl = (41.903032, -87.687017)
#wicker_br = (41.903354, -87.667525)

# Rounded edges of the bounding box actually used.
wicker_north_lat, wicker_south_lat = 41.91, 41.90
wicker_east_long, wicker_west_long = -87.67, -87.69

# Corners as (latitude, longitude): top-right, top-left, bottom-left, bottom-right.
wicker_tr = (wicker_north_lat, wicker_east_long)
wicker_tl = (wicker_north_lat, wicker_west_long)
wicker_bl = (wicker_south_lat, wicker_west_long)
wicker_br = (wicker_south_lat, wicker_east_long)

# Ordered so that consecutive corners share an edge of the box.
wicker_points = [wicker_tr, wicker_tl, wicker_bl, wicker_br]

In [None]:
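# Disabled draft: filter `stations` down to those inside the Wicker Park bounding box.
# The longitude comparisons were inverted in the first draft (the east edge was used as the
# lower bound and the west edge as the upper bound), so it could never match; corrected here.
#wicker_park_stations = stations[
#    (stations['lat'] <= wicker_tr[0]) & (stations['lat'] <= wicker_tl[0]) &
#    (stations['lat'] >= wicker_br[0]) & (stations['lat'] >= wicker_bl[0]) &
#    (stations['long'] <= wicker_tr[1]) & (stations['long'] <= wicker_br[1]) &
#    (stations['long'] >= wicker_tl[1]) & (stations['long'] >= wicker_bl[1])
#]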

In [None]:
# Starting latitude and longitude taken from: https://alysivji.github.io/getting-started-with-folium.html
m = folium.Map([41.8781, -87.6298], zoom_start=11, tiles="CartoDB dark_matter")

# Plot the stations: one red marker per row of `stations`, with the station name as popup.
# Fields are read by column name rather than tuple position, and no per-row debug output
# is printed (it floods the cell output).
for station in stations.itertuples(index=False):
    folium.Marker(
        [station.lat, station.long],
        popup=station.station,
        icon=folium.Icon(color='red'),
    ).add_to(m)

In [None]:
# Create a column that tracks whether a trip ended at the station it started at.
# Done as one vectorised comparison instead of a row-wise apply(), which is very slow on
# the full trips frame and relied on positional x[0]/x[1] lookups on a label-indexed row.
# Both columns are cast to object first: as categoricals with different category sets
# they cannot be compared directly. Missing names compare as False, as before.
df['same_station_trip'] = (
    df['from_station_name'].astype(object) == df['to_station_name'].astype(object)
)

In [None]:
# We want two different colors for stations where a journey originates, and stations where a journey ends.
# folium.Icon only accepts its fixed set of named colours (hex strings are not valid),
# so the origin/destination colours are the closest named ones to the intended
# green (#277554) and burgundy (#983352).
# Only the first 100 trips are plotted to keep the map light.
trip_endpoints = df[
    ['from_station_name', 'latitude_start', 'longitude_start',
     'to_station_name', 'latitude_end', 'longitude_end']
].head(100)

for trip in trip_endpoints.itertuples(index=False):
    endpoints = [
        (trip.from_station_name, trip.latitude_start, trip.longitude_start, 'darkgreen'),  # origin
        (trip.to_station_name, trip.latitude_end, trip.longitude_end, 'darkred'),          # destination
    ]
    for name, lat, long, color in endpoints:
        folium.Marker([lat, long], popup=name, icon=folium.Icon(color=color)).add_to(m)


In [None]:
# Add square encircling wicker to the map.
# PolyLine draws an open path through the points it is given, so the first corner is
# repeated at the end to draw the fourth side and close the square.
folium.PolyLine(list(wicker_points) + [wicker_points[0]], color='blue').add_to(m)