In [7]:
# Numeric / dataframe stack plus stdlib helpers used by the notebook.
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
# Automatic garbage collection is already on by default in CPython, so this
# call only makes that explicit.
gc.enable()
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re
import json

# NOTE(review): wildcard import -- which names this notebook takes from
# useful_functions cannot be told from here; prefer explicit imports.
from useful_functions import *

# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

## Load station data

In [2]:
# Read the per-station table from disk; later cells use its lon_ave / lat_ave
# coordinates and its city_* indicator columns.
st_data = pd.read_feather('data/final_station_data.feather')

In [4]:
# Sanity check: list the columns, non-null counts and dtypes of the loaded table.
st_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 12 columns):
index            618 non-null int64
id               618 non-null int64
lon_ave          618 non-null float64
lat_ave          618 non-null float64
dp_max           618 non-null float64
dp_min           618 non-null float64
online_month     618 non-null float64
online_day       618 non-null float64
online_year      618 non-null float64
city_Chicago     618 non-null uint8
city_Evanston    618 non-null uint8
city_Oak_Park    618 non-null uint8
dtypes: float64(7), int64(2), uint8(3)
memory usage: 45.3 KB


## Load geodata from GeoJSON

In [8]:
# Load the community-area boundaries (GeoJSON) into a plain dict.
# GeoJSON text is UTF-8 (RFC 7946), so the encoding is passed explicitly
# instead of relying on the platform's default locale encoding.
with open('geo_data/Boundaries_Community_Areas.geojson', 'r', encoding='utf-8') as p:
    ca = json.load(p)

In [23]:
from shapely.geometry import shape, Point

# Map each community-area name to the shapely geometry built from its GeoJSON
# feature. Nothing is merged: there is one entry per name, and if a name
# occurs more than once the later feature replaces the earlier one.
ca_dict = {
    feature['properties']['community']: shape(feature['geometry'])
    for feature in ca['features']
}

## Assign a community area to each station

In [63]:
def _assign_ca(row):
    """Return the community-area label for one station row.

    The station is turned into a point (x = ``row.lon_ave``,
    y = ``row.lat_ave``) and tested against every polygon in the module-level
    ``ca_dict``. The first polygon that contains the point decides the label:
    the area name with surrounding whitespace stripped and each space replaced
    by an underscore.

    When no polygon contains the point, the ``city_Evanston`` and
    ``city_Oak_Park`` flags of the row are consulted, in that order.

    Parameters
    ----------
    row : object exposing ``lat_ave``, ``lon_ave``, ``city_Evanston`` and
        ``city_Oak_Park`` attributes (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str or None
        The label, ``'Evanston'``, ``'Oak_Park'``, or ``None`` when nothing
        matches.
    """
    lat, lon = row.lat_ave, row.lon_ave
    station_point = Point(lon, lat)

    # NOTE(review): shapely's contains() excludes the polygon boundary, so a
    # point lying exactly on an edge falls through to the city flags below.
    for area_name, area_polygon in ca_dict.items():
        if area_polygon.contains(station_point):
            return area_name.strip().replace(' ', '_')

    # Outside every polygon: fall back to the city indicator columns.
    if row.city_Evanston == 1:
        return 'Evanston'
    if row.city_Oak_Park == 1:
        return 'Oak_Park'
    return None

In [64]:
# Label every station: _assign_ca is called once per row (axis=1).
st_data['community_area'] = st_data.apply(_assign_ca, axis=1)

In [65]:
# Show the distinct community_area labels to eyeball the assignment.
st_data['community_area'].unique()

array(['LOOP', 'NEAR_SOUTH_SIDE', 'MCKINLEY_PARK', 'SOUTH_SHORE',
       'LINCOLN_PARK', 'LOWER_WEST_SIDE', 'WEST_TOWN', 'NEAR_WEST_SIDE',
       'NEAR_NORTH_SIDE', 'LOGAN_SQUARE', 'WOODLAWN', 'LAKE_VIEW',
       'ARMOUR_SQUARE', 'KENWOOD', 'DOUGLAS', 'NORTH_CENTER',
       'GRAND_BOULEVARD', 'WASHINGTON_PARK', 'BRIDGEPORT', 'UPTOWN',
       'LINCOLN_SQUARE', 'HYDE_PARK', 'EDGEWATER', 'SOUTH_LAWNDALE',
       'ROGERS_PARK', 'HUMBOLDT_PARK', 'EAST_GARFIELD_PARK', 'NEW_CITY',
       'FULLER_PARK', 'ENGLEWOOD', 'GREATER_GRAND_CROSSING',
       'NORTH_LAWNDALE', 'WEST_RIDGE', 'NORTH_PARK', 'ALBANY_PARK',
       'IRVING_PARK', 'AVONDALE', 'AUSTIN', 'WEST_GARFIELD_PARK',
       'WEST_ENGLEWOOD', 'CHATHAM', 'AVALON_PARK', 'SOUTH_CHICAGO',
       'PORTAGE_PARK', 'Evanston', 'HERMOSA', 'Oak_Park'], dtype=object)

## One-hot encode (OHE) the community area

In [68]:
# Remove the columns that are not carried into the model template, including
# the city_* flags that _assign_ca reads when labelling stations.
st_data = st_data.drop(
    columns=[
        'index',
        'dp_min',
        'online_month',
        'online_day',
        'online_year',
        'city_Chicago',
        'city_Evanston',
        'city_Oak_Park',
    ]
)

In [70]:
# One-hot encode the community-area label into community_area_<NAME> columns.
# The dtype is pinned to uint8 so the saved template keeps the column types
# shown in the info() output below: pandas 2.0 changed the get_dummies default
# from uint8 to bool.
# Rows whose community_area is missing get all-zero indicator columns.
st_data = pd.get_dummies(st_data, columns=['community_area'], dtype=np.uint8)

In [72]:
# Confirm the encoded layout: remaining base columns plus one indicator column per community area.
st_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 618 entries, 0 to 617
Data columns (total 51 columns):
id                                       618 non-null int64
lon_ave                                  618 non-null float64
lat_ave                                  618 non-null float64
dp_max                                   618 non-null float64
community_area_ALBANY_PARK               618 non-null uint8
community_area_ARMOUR_SQUARE             618 non-null uint8
community_area_AUSTIN                    618 non-null uint8
community_area_AVALON_PARK               618 non-null uint8
community_area_AVONDALE                  618 non-null uint8
community_area_BRIDGEPORT                618 non-null uint8
community_area_CHATHAM                   618 non-null uint8
community_area_DOUGLAS                   618 non-null uint8
community_area_EAST_GARFIELD_PARK        618 non-null uint8
community_area_EDGEWATER                 618 non-null uint8
community_area_ENGLEWOOD                 618 non-

## Save data
Save the table as both a feather file and a pickle file, then compare how fast each one reads back.

In [73]:
# Save to feather
# NOTE(review): to_feather expects the default integer index; the info()
# output above reports a RangeIndex, so that appears to hold here.
st_data.to_feather('data/model_v1.0_template.feather')

In [74]:
# Save to pickle
# Same frame written a second time so both formats can be timed on identical data.
st_data.to_pickle('data/model_v1.0_template.pk')

In [90]:
%%time
# Single timed read of the feather copy (one run, not an average).
test1 = pd.read_feather('data/model_v1.0_template.feather')

CPU times: user 5.14 ms, sys: 2.45 ms, total: 7.59 ms
Wall time: 8.17 ms


In [89]:
%%time
# Single timed read of the pickle copy (one run, not an average).
# The file was written by this notebook; only unpickle files you trust.
test2 = pd.read_pickle('data/model_v1.0_template.pk')

CPU times: user 2.37 ms, sys: 1.07 ms, total: 3.44 ms
Wall time: 2.92 ms


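Conclusion: in this run, reading the pickle file was faster than reading the feather file (2.92 ms vs. 8.17 ms wall time). Each read was timed only once on a 618-row table, so the difference may be within measurement noise.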

## Backup cell: check the bike stations against the community-area boundaries on a map

In [46]:
import folium
from folium import GeoJson


# Base map centred on the coordinates given below.
check_map = folium.Map(location=[41.90, -87.64], zoom_start=10)

# One semi-transparent blue circle per station, placed at (lat_ave, lon_ave).
for lat, long in zip(st_data.lat_ave, st_data.lon_ave):
    folium.CircleMarker(
        [lat, long],
        radius=5,
        color="blue",
        weight=2,
        fill=True,
        fill_opacity=0.4,
    ).add_to(check_map)

# Overlay the community-area boundaries in red; every feature gets the same style.
GeoJson(
    ca,
    style_function=lambda feature: {
        'fillColor': 'red',
        'color': 'red',
        'weight': 3,
        'fillOpacity': 0.2,
    },
).add_to(check_map)

# Last expression of the cell: renders the map inline.
check_map