In [1]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import datetime
import csv
import os
from sklearn.metrics import r2_score, mean_squared_error
from IPython.display import display

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
from modules.prediction import load_all_data
from modules.prediction import precrime_train_test_split
from modules.prediction import load_splits
from modules.prediction import create_all_splits
from modules.prediction import sample_model
from modules.fancy_time_series import fancy_time_series_model
from modules.eval_model import eval_predictions

In [4]:
crime_data = load_all_data()
splits = load_splits()
train_test_data = create_all_splits(crime_data, splits)

  mask |= (ar1 == a)


In [14]:
X_train_fine, X_test_fine, y_train_fine, y_test_fine = train_test_data['fine']
X_train_coarse, X_test_coarse, y_train_coarse, y_test_coarse = train_test_data['coarse']
X_train_2016, X_test_2016, y_train_2016, y_test_2016 = train_test_data['2016']

In [16]:
def get_np_dates(X):
    y = np.array(X['COMPLAINT_YEAR']-1970, dtype='<M8[Y]')
    m = np.array(X['COMPLAINT_MONTH']-1, dtype='<m8[M]')
    d = np.array(X['COMPLAINT_DAY']-1, dtype='<m8[D]')
    mid_dates = y+m+d
    return pd.Series(mid_dates, index=X.index)

def get_one_decimal_date(y, m, d):
    y_np = np.array([y - 1970], dtype='<M8[Y]')
    m_np = np.array([m - 1], dtype='<m8[M]')
    d_np = np.array([d - 1], dtype='<m8[D]')

    return (y_np + m_np + d_np)[0]

def get_52_weeks_ago(y, m, d):
    today_np = get_one_decimal_date(y, m, d)
    lastyear_np = today_np - np.timedelta64(7 * 52, 'D')
    return lastyear_np

In [19]:
X_all = pd.concat([X_train_fine, X_test_fine])
X_all['DECIMAL_DATE'] = get_np_dates(X_all)
y_all = pd.concat([y_train_fine, y_test_fine])
all_all = pd.merge(X_all, y_all, left_index=True, right_index=True)

y_pred = X_test_fine[[
    'COMPLAINT_YEAR',
    'COMPLAINT_MONTH',
    'COMPLAINT_DAY',
    'COMPLAINT_HOURGROUP',
    'ADDR_PCT_CD'
]].copy()

buckets = X_test_fine[[
    'COMPLAINT_YEAR',
    'COMPLAINT_MONTH',
    'COMPLAINT_DAY',
]].copy().drop_duplicates()
buckets['DECIMAL_DATE'] = get_np_dates(buckets)
preds = []
y_train_dvs = y_train_fine.select_dtypes(exclude=['object']).columns

In [33]:
comparison_fullyear = all_all[
    (all_all['DECIMAL_DATE'] >= get_52_weeks_ago(2015, 1, 1)) &
    (all_all['DECIMAL_DATE'] <  get_one_decimal_date(2015, 1, 1))
]
total_felonies_last_year = np.sum(comparison_fullyear[y_train_dvs].values)

In [37]:
comparison_fullyear_bucketed = comparison_fullyear.groupby([
    'COMPLAINT_DAYOFWEEK', 'COMPLAINT_HOURGROUP', 'ADDR_PCT_CD'
])[y_train_dvs].sum() / total_felonies_last_year

In [39]:
comparison_fullyear_bucketed

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Homicide,Rape,Robbery,FelonyAssault,Burglary,GrandLarceny,GrandLarcenyAuto,Fraud,Forgery,Arson,Drugs,Weapons,CriminalMischief,Other
COMPLAINT_DAYOFWEEK,COMPLAINT_HOURGROUP,ADDR_PCT_CD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000040,0.000007,0.000013,0.000000,0.000000,0.000000,0.000000,0.000007,0.000000
0,0,5,0.000000,0.000000,0.000013,0.000007,0.000013,0.000020,0.000007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000007
0,0,6,0.000000,0.000000,0.000027,0.000027,0.000060,0.000086,0.000007,0.000000,0.000007,0.000000,0.000000,0.000007,0.000013,0.000013
0,0,7,0.000000,0.000007,0.000027,0.000040,0.000007,0.000040,0.000007,0.000007,0.000000,0.000000,0.000013,0.000000,0.000007,0.000007
0,0,9,0.000000,0.000013,0.000020,0.000020,0.000007,0.000126,0.000007,0.000000,0.000013,0.000000,0.000000,0.000000,0.000007,0.000007
0,0,10,0.000000,0.000000,0.000027,0.000020,0.000007,0.000093,0.000000,0.000040,0.000000,0.000007,0.000000,0.000000,0.000013,0.000007
0,0,13,0.000000,0.000000,0.000007,0.000020,0.000040,0.000120,0.000007,0.000027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
0,0,14,0.000000,0.000000,0.000007,0.000020,0.000020,0.000093,0.000000,0.000020,0.000007,0.000000,0.000007,0.000007,0.000007,0.000013
0,0,17,0.000000,0.000000,0.000007,0.000000,0.000027,0.000060,0.000000,0.000007,0.000000,0.000000,0.000000,0.000007,0.000020,0.000000
0,0,18,0.000000,0.000000,0.000020,0.000013,0.000066,0.000086,0.000000,0.000013,0.000000,0.000000,0.000000,0.000000,0.000000,0.000013


In [None]:
for index, bucket in buckets.iterrows():
    comparison = all_all[
        (all_all['COMPLAINT_DAYOFWEEK'] == bucket['COMPLAINT_DAYOFWEEK']) &
        (all_all['COMPLAINT_HOURGROUP'] == bucket['COMPLAINT_HOURGROUP']) &
        (all_all['DECIMAL_DATE'] < (bucket['DECIMAL_DATE'] - 6/365)) &
        (all_all['DECIMAL_DATE'] > (bucket['DECIMAL_DATE'] - 37/365))
    ]
    pred = comparison.groupby('ADDR_PCT_CD')[y_train_dvs].mean()
    pred.reset_index(inplace=True)
    for fld in [
        'COMPLAINT_YEAR',
        'COMPLAINT_MONTH',
        'COMPLAINT_DAY',
        'COMPLAINT_HOURGROUP',
    ]:
        pred[fld] = bucket[fld]
    preds.append(pred)
all_preds = pd.concat(preds)

In [15]:
y_ts_fine = simple_time_series_model(X_train_fine, y_train_fine, X_test_fine, y_test_fine)
y_ts_coarse = simple_time_series_model(X_train_coarse, y_train_coarse, X_test_coarse, y_test_coarse)
y_ts_2016 = simple_time_series_model(X_train_2016, y_train_2016, X_test_2016, y_test_2016)

NameError: name 'simple_time_series_model' is not defined

In [6]:
eval_predictions(X_test_fine, y_test_fine, y_ts_fine)

------------------------------------------------------------------
Four-hour buckets:
------------------------------------------------------------------
Homicide:         R2 =    -20.0, RMSE =     0.060, RMSE (%) =  2143.289
Rape:             R2 =    -18.8, RMSE =     0.133, RMSE (%) =   956.153
Robbery:          R2 =    -13.2, RMSE =     0.381, RMSE (%) =   328.924
FelonyAssault:    R2 =    -13.0, RMSE =     0.398, RMSE (%) =   362.422
Burglary:         R2 =    -13.4, RMSE =     0.375, RMSE (%) =   340.812
GrandLarceny:     R2 =      0.7, RMSE =     0.559, RMSE (%) =   222.733
GrandLarcenyAuto: R2 =    -14.5, RMSE =     0.270, RMSE (%) =   457.720
Fraud:            R2 =    -15.6, RMSE =     0.199, RMSE (%) =   624.481
Forgery:          R2 =    -14.2, RMSE =     0.198, RMSE (%) =   667.583
Arson:            R2 =    -20.8, RMSE =     0.103, RMSE (%) =  1302.002
Drugs:            R2 =    -11.1, RMSE =     0.215, RMSE (%) =   599.847
Weapons:          R2 =    -14.5, RMSE =     0.192, RMSE

In [7]:
eval_predictions(X_test_coarse, y_test_coarse, y_ts_coarse)

------------------------------------------------------------------
Four-hour buckets:
------------------------------------------------------------------
Homicide:         R2 =    -18.8, RMSE =     0.055, RMSE (%) =  2284.914
Rape:             R2 =    -19.1, RMSE =     0.134, RMSE (%) =   941.800
Robbery:          R2 =    -12.5, RMSE =     0.376, RMSE (%) =   329.583
FelonyAssault:    R2 =    -11.3, RMSE =     0.408, RMSE (%) =   348.695
Burglary:         R2 =    -14.4, RMSE =     0.360, RMSE (%) =   351.528
GrandLarceny:     R2 =      0.5, RMSE =     0.566, RMSE (%) =   217.953
GrandLarcenyAuto: R2 =    -14.5, RMSE =     0.255, RMSE (%) =   478.932
Fraud:            R2 =    -16.4, RMSE =     0.191, RMSE (%) =   649.797
Forgery:          R2 =    -14.7, RMSE =     0.191, RMSE (%) =   680.949
Arson:            R2 =    -19.1, RMSE =     0.099, RMSE (%) =  1343.295
Drugs:            R2 =    -12.1, RMSE =     0.203, RMSE (%) =   625.704
Weapons:          R2 =    -12.5, RMSE =     0.198, RMSE

In [8]:
eval_predictions(X_test_2016, y_test_2016, y_ts_2016)

------------------------------------------------------------------
Four-hour buckets:
------------------------------------------------------------------
Homicide:         R2 =    -19.4, RMSE =     0.049, RMSE (%) =  2603.355
Rape:             R2 =    -19.8, RMSE =     0.124, RMSE (%) =   991.948
Robbery:          R2 =    -13.4, RMSE =     0.336, RMSE (%) =   367.911
FelonyAssault:    R2 =    -11.9, RMSE =     0.413, RMSE (%) =   337.855
Burglary:         R2 =    -17.4, RMSE =     0.310, RMSE (%) =   408.249
GrandLarceny:     R2 =     -3.1, RMSE =     0.558, RMSE (%) =   224.013
GrandLarcenyAuto: R2 =    -17.3, RMSE =     0.212, RMSE (%) =   574.197
Fraud:            R2 =    -17.6, RMSE =     0.169, RMSE (%) =   726.302
Forgery:          R2 =    -12.7, RMSE =     0.213, RMSE (%) =   600.130
Arson:            R2 =    -19.4, RMSE =     0.077, RMSE (%) =  1753.834
Drugs:            R2 =    -14.1, RMSE =     0.188, RMSE (%) =   672.322
Weapons:          R2 =    -12.3, RMSE =     0.199, RMSE