In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

%matplotlib inline
pd.set_option('display.max_rows', 200)

In [2]:
from modules.nypd_data import read_orig_file
from modules.nypd_data import filter_raw_data
from modules.nypd_data import save_dated_felonies
from modules.nypd_data import load_dated_felonies
from modules.nypd_data import save_clean_felonies
from modules.nypd_data import load_clean_felonies
from modules.nypd_data import add_offense_category
from modules.nypd_data import add_datetime_columns
from modules.nypd_data import save_pivoted_felonies
from modules.nypd_data import load_pivoted_felonies

In [None]:
# Before running: execute "tar -xvf cf.tar.gz" from the root directory of the git repo,
# copy clean_felonies.csv into that directory, and unzip it.
# NOTE(review): the intended order of the extract / copy / unzip steps is not clear
# from this note alone -- confirm against the repo layout.
#
# To make the files from scratch (not necessary):
# save_dated_felonies()
# save_clean_felonies()


# Load the cleaned felony records (going by the helper's name) into nypd_data.
nypd_data = load_clean_felonies()
# The return value is discarded, so this presumably updates nypd_data in place
# (likely adding the OFFENSE column that the pivot in the next cell uses) -- TODO confirm.
add_offense_category(nypd_data)

  mask |= (ar1 == a)


In [None]:
# Count rows per (year, month, precinct) with one column per OFFENSE value.
# The result is not assigned: it is only shown as the cell's output.
nypd_data.pivot_table(
    index=[
        # Year and month are pulled element-by-element from COMPLAINT_DATETIME.
        # Both derived Series keep the source column's name, so the two index
        # levels presumably end up with the same label -- TODO confirm.
        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.year),
        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.month),
#        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.day),
#        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.hour),
#        'BORO_NM',
        'ADDR_PCT_CD',   # These are not duplicated across boros.
    ],
    # aggfunc=len counts the rows in each group, so KY_CD only serves as the
    # column to count over; missing (group, OFFENSE) cells are shown as 0.
    values='KY_CD',
    columns='OFFENSE',
    fill_value=0,
    aggfunc=len
)

In [None]:
# Called for its effect on nypd_data (the return value is not used). Presumably this
# adds the COMPLAINT_DAYOFWEEK / COMPLAINT_HOURGROUP columns that the pivots
# further down index on -- TODO confirm against modules.nypd_data.
add_datetime_columns(nypd_data)


In [None]:
# Before running: execute "tar -xvf pivoted_felonies.tar.gz" from the root directory of the git repo.
#
# To make the files from scratch (not necessary):
# save_pivoted_felonies(nypd_data)
#
# Load the previously saved pivoted data (going by the helper's name) instead of
# rebuilding it from nypd_data.
nypd_pivoted = load_pivoted_felonies()

In [None]:
# Show the loaded pivoted data as the cell output (a bare last expression is
# rendered by the notebook). Row display is capped by the display.max_rows
# option set in the first cell.
nypd_pivoted

In [None]:
# Row counts laid out as (day of week, hour group) rows by precinct columns;
# cells with no matching rows are filled with 0.
pivoted_weekday_precinct = pd.pivot_table(
    nypd_data,
    values='KY_CD',
    index=['COMPLAINT_DAYOFWEEK', 'COMPLAINT_HOURGROUP'],
    columns='ADDR_PCT_CD',
    aggfunc=len,
    fill_value=0,
)

In [None]:
# Average across the precinct columns for every (day of week, hour group) row,
# draw the result as a bar chart and write it to disk.
ax = pivoted_weekday_precinct.mean(axis='columns').plot.bar(figsize=(20, 10))
ax.get_figure().savefig('mean_complaints_by_dayofweek_time.png')

In [None]:
# Average down the (day of week, hour group) rows for every precinct column,
# draw the result as a bar chart and write it to disk.
ax = pivoted_weekday_precinct.mean(axis='index').plot.bar(figsize=(20, 10))
ax.get_figure().savefig('mean_complaints_by_precinct.png')

In [None]:
# Long-format counts: one row per (day of week, hour group, precinct) with the
# number of matching rows in KY_CD, plus a combined "<day>_<hourgroup>" label.
# NOTE(review): combinations with no rows are presumably absent here rather than
# present with a 0 count -- confirm before reading the model fits below.
summed_weekday_precinct = (
    pd.pivot_table(
        nypd_data,
        values='KY_CD',
        index=['COMPLAINT_DAYOFWEEK', 'COMPLAINT_HOURGROUP', 'ADDR_PCT_CD'],
        aggfunc=len,
        fill_value=0,
    )
    .reset_index()
    .assign(
        COMPLAINT_DAY_HOUR=lambda counts: (
            counts['COMPLAINT_DAYOFWEEK'].astype(str)
            + '_'
            + counts['COMPLAINT_HOURGROUP'].astype(str)
        )
    )
)

In [None]:
def eval_ridge(X_train, X_test, y_train, y_test):
    """Fit a default ``Ridge`` model and print a short evaluation report.

    The targets are treated as log-transformed: test targets and predictions are
    passed through ``np.exp`` before the "actual space" R2 is computed, and each
    coefficient is printed as the percentage change ``100 * (exp(coef) - 1)``.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for scoring. If ``X_train`` has a
        ``columns`` attribute (e.g. a DataFrame), those labels name the
        coefficients in the report; otherwise ``x0, x1, ...`` are used.
    y_train, y_test
        Log-space targets for fitting and for scoring.

    Returns
    -------
    The fitted ``Ridge`` estimator.
    """
    ridge = Ridge()
    ridge.fit(X_train, y_train)
    print('R2 (log space):', ridge.score(X_test, y_test))
    print('R2 (actual space):', r2_score(np.exp(y_test), np.exp(ridge.predict(X_test))))
    # NOTE(review): the cells in this notebook build y as log(1 + count), so
    # np.exp maps back to count + 1. A constant shift of both arguments does not
    # change R2, but the baseline printed here is on the count + 1 scale.
    print('Baseline: ', np.exp(ridge.intercept_))

    # Label coefficients from the matrix that was actually fitted, not from a
    # module-level ``X`` that may be missing or stale in the kernel.
    feature_names = getattr(X_train, 'columns', None)
    if feature_names is None:
        feature_names = ['x{0}'.format(i) for i in range(len(ridge.coef_))]

    for col, val in zip(feature_names, ridge.coef_):
        print('{0: <30}'.format(col), '{0:.0f}%'.format(100*(-1+np.exp(val))))
    return ridge

In [None]:
# Model fit where the combined weekday / hour-group label is the only feature.

X = pd.get_dummies(
    summed_weekday_precinct.loc[:, ['COMPLAINT_DAY_HOUR']].astype(str)
)
y = np.log(summed_weekday_precinct['KY_CD'].values + 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4800, shuffle=True
)
eval_ridge(X_train, X_test, y_train, y_test)

In [None]:
# Model fit where the precinct code is the only feature.

X = pd.get_dummies(
    summed_weekday_precinct.loc[:, ['ADDR_PCT_CD']].astype(str)
)
y = np.log(summed_weekday_precinct['KY_CD'].values + 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4800, shuffle=True
)
eval_ridge(X_train, X_test, y_train, y_test)

In [None]:
# Model fit using both features: the weekday / hour-group label and the precinct code.

X = pd.get_dummies(
    summed_weekday_precinct.loc[:, ['COMPLAINT_DAY_HOUR', 'ADDR_PCT_CD']].astype(str)
)
y = np.log(summed_weekday_precinct['KY_CD'].values + 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4800, shuffle=True
)
eval_ridge(X_train, X_test, y_train, y_test)