In [2]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import datetime
import csv
import os
from sklearn.metrics import r2_score, mean_squared_error
from IPython.display import display

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [3]:
from modules.prediction import load_all_data
from modules.prediction import precrime_train_test_split
from modules.prediction import load_splits
from modules.prediction import create_all_splits
from modules.prediction import sample_model
from modules.fancy_time_series import fancy_time_series_model
from modules.eval_model import eval_predictions

In [4]:
# Load the full dataset and the split definitions through the project
# helpers, then materialise every split. What each helper actually reads is
# defined in modules.prediction, not in this notebook.
crime_data = load_all_data()
splits = load_splits()
# `train_test_data` is looked up by split name in the following cell.
train_test_data = create_all_splits(crime_data, splits)

  mask |= (ar1 == a)


In [5]:
# Unpack each named split into its four parts, in the order
# (X_train, X_test, y_train, y_test).
# NOTE(review): what distinguishes 'fine', 'coarse' and '2016' is decided by
# load_splits()/create_all_splits() — confirm there before relying on the names.
X_train_fine, X_test_fine, y_train_fine, y_test_fine = train_test_data['fine']
X_train_coarse, X_test_coarse, y_train_coarse, y_test_coarse = train_test_data['coarse']
X_train_2016, X_test_2016, y_train_2016, y_test_2016 = train_test_data['2016']

In [6]:
def get_np_dates(X):
    """Build one calendar date per row from the date-part columns of ``X``.

    Reads ``COMPLAINT_YEAR``, ``COMPLAINT_MONTH`` and ``COMPLAINT_DAY``.
    The year becomes a ``datetime64[Y]`` anchor (years since 1970); month
    and day are 1-based and are added as zero-based offsets, so a day number
    past the end of its month rolls over into the following month rather
    than raising.

    Returns a ``pd.Series`` of dates that shares ``X``'s index.
    """
    year_anchor = np.array(X['COMPLAINT_YEAR'] - 1970, dtype='<M8[Y]')
    month_offset = np.array(X['COMPLAINT_MONTH'] - 1, dtype='<m8[M]')
    day_offset = np.array(X['COMPLAINT_DAY'] - 1, dtype='<m8[D]')
    return pd.Series(year_anchor + month_offset + day_offset, index=X.index)

def get_one_decimal_date(y, m, d):
    """Return the calendar date ``y``-``m``-``d`` as a numpy datetime scalar.

    Despite the name, the result is a day-resolution ``np.datetime64``, not
    a fractional year. ``m`` and ``d`` are 1-based and are applied as
    zero-based month/day offsets from the start of year ``y``.
    """
    year_anchor = np.datetime64(y - 1970, 'Y')
    return year_anchor + np.timedelta64(m - 1, 'M') + np.timedelta64(d - 1, 'D')

def get_52_weeks_ago(y, m, d):
    """Return the date 52 whole weeks (364 days) before ``y``-``m``-``d``.

    Stepping back by whole weeks keeps the result on the same weekday as the
    input date; it is one or two days short of a full calendar year.
    """
    return get_one_decimal_date(y, m, d) - np.timedelta64(52, 'W')

def get_4_weeks_ago(y, m, d):
    """Return the date 4 whole weeks (28 days) before ``y``-``m``-``d``.

    Stepping back by whole weeks keeps the result on the same weekday as the
    input date.
    """
    return get_one_decimal_date(y, m, d) - np.timedelta64(4, 'W')


In [7]:
# Stack the train and test features of the 'fine' split into one frame and
# attach a per-row date. Despite its name, DECIMAL_DATE holds datetime values
# (get_np_dates returns a Series of dates), not fractional years.
X_all = pd.concat([X_train_fine, X_test_fine])
X_all['DECIMAL_DATE'] = get_np_dates(X_all)
y_all = pd.concat([y_train_fine, y_test_fine])
# Join features and targets on their shared index.
# NOTE(review): an index-on-index merge multiplies rows when index labels
# repeat — confirm train and test indices are unique and disjoint.
all_all = pd.merge(X_all, y_all, left_index=True, right_index=True)

# Copy of the key columns of the test rows (a copy, so later edits cannot
# write through to X_test_fine).
y_pred = X_test_fine[[
    'COMPLAINT_YEAR',
    'COMPLAINT_MONTH',
    'COMPLAINT_DAY',
    'COMPLAINT_HOURGROUP',
    'ADDR_PCT_CD'
]].copy()

# One row per distinct calendar day in the test set, plus that day's date.
# Only the three date-part columns are kept: this frame has no hour-group or
# day-of-week column.
buckets = X_test_fine[[
    'COMPLAINT_YEAR',
    'COMPLAINT_MONTH',
    'COMPLAINT_DAY',
]].copy().drop_duplicates()
buckets['DECIMAL_DATE'] = get_np_dates(buckets)
# Accumulator for per-bucket prediction frames; it is created here, so a cell
# that appends to it will keep growing it until this cell is re-run.
preds = []
# Target column names: every non-object column of y_train_fine.
y_train_dvs = y_train_fine.select_dtypes(exclude=['object']).columns

In [8]:
# Two reference windows, both ending just before 1 Jan 2015 (end exclusive):
# the preceding 52 weeks and the preceding 4 weeks. For each window, keep the
# matching rows and the grand total over every target column.
comparison_fullyear = all_all.loc[
    (all_all['DECIMAL_DATE'] >= get_52_weeks_ago(2015, 1, 1))
    & (all_all['DECIMAL_DATE'] < get_one_decimal_date(2015, 1, 1))
]
total_felonies_last_year = comparison_fullyear[y_train_dvs].values.sum()

comparison_lastmonth = all_all.loc[
    (all_all['DECIMAL_DATE'] >= get_4_weeks_ago(2015, 1, 1))
    & (all_all['DECIMAL_DATE'] < get_one_decimal_date(2015, 1, 1))
]
total_felonies_last_month = comparison_lastmonth[y_train_dvs].values.sum()

In [17]:
# Weekly profile from the 52-week window, rescaled to the recent level.
# Each (day-of-week, hour group, precinct) sum covers 52 weeks, so sum / 52 is
# the per-week average; multiplying by the ratio of recent to long-run weekly
# totals, (last_month / 4) / (last_year / 52), collapses to the single factor
# last_month / (4 * last_year) used below.
comparison_fullyear_bucketed = (
    comparison_fullyear
    .groupby(['COMPLAINT_DAYOFWEEK', 'COMPLAINT_HOURGROUP', 'ADDR_PCT_CD'])[y_train_dvs]
    .sum()
    .mul(total_felonies_last_month / (4 * total_felonies_last_year))
    .reset_index()
)

In [18]:
# Show the scaled profile rows whose day-of-week code is 0.
# NOTE(review): which weekday code 0 denotes is set where COMPLAINT_DAYOFWEEK
# is built — confirm upstream.
comparison_fullyear_bucketed.loc[
    comparison_fullyear_bucketed['COMPLAINT_DAYOFWEEK'].eq(0)
]

Unnamed: 0,COMPLAINT_DAYOFWEEK,COMPLAINT_HOURGROUP,ADDR_PCT_CD,Homicide,Rape,Robbery,FelonyAssault,Burglary,GrandLarceny,GrandLarcenyAuto,Fraud,Forgery,Arson,Drugs,Weapons,CriminalMischief,Other
0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.104994,0.017499,0.034998,0.000000,0.000000,0.000000,0.000000,0.017499,0.000000
1,0,0,5,0.000000,0.000000,0.034998,0.017499,0.034998,0.052497,0.017499,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017499
2,0,0,6,0.000000,0.000000,0.069996,0.069996,0.157491,0.227486,0.017499,0.000000,0.017499,0.000000,0.000000,0.017499,0.034998,0.034998
3,0,0,7,0.000000,0.017499,0.069996,0.104994,0.017499,0.104994,0.017499,0.017499,0.000000,0.000000,0.034998,0.000000,0.017499,0.017499
4,0,0,9,0.000000,0.034998,0.052497,0.052497,0.017499,0.332480,0.017499,0.000000,0.034998,0.000000,0.000000,0.000000,0.017499,0.017499
5,0,0,10,0.000000,0.000000,0.069996,0.052497,0.017499,0.244985,0.000000,0.104994,0.000000,0.017499,0.000000,0.000000,0.034998,0.017499
6,0,0,13,0.000000,0.000000,0.017499,0.052497,0.104994,0.314981,0.017499,0.069996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0,0,14,0.000000,0.000000,0.017499,0.052497,0.052497,0.244985,0.000000,0.052497,0.017499,0.000000,0.017499,0.017499,0.017499,0.034998
8,0,0,17,0.000000,0.000000,0.017499,0.000000,0.069996,0.157491,0.000000,0.017499,0.000000,0.000000,0.000000,0.017499,0.052497,0.000000
9,0,0,18,0.000000,0.000000,0.052497,0.034998,0.174990,0.227486,0.000000,0.034998,0.000000,0.000000,0.000000,0.000000,0.000000,0.034998


In [12]:
# Display the grand total over all target columns in the four-week window.
total_felonies_last_month

10535

In [None]:
# Per-bucket baseline: for every distinct (date, hour group) in the test set,
# predict each precinct's target values as the mean over rows with the same
# day-of-week and hour group dated strictly between 37 and 6 days earlier
# (for whole-day dates on a matching weekday, that is 7-35 days back).
#
# The bucket frame is built here so that it carries the hour-group and
# day-of-week keys the filter needs; a frame holding only year/month/day
# cannot drive this loop.
# NOTE(review): assumes X_test_fine has a COMPLAINT_DAYOFWEEK column (all_all,
# which is built from it, is grouped by that column) — confirm.
bucket_fields = [
    'COMPLAINT_YEAR',
    'COMPLAINT_MONTH',
    'COMPLAINT_DAY',
    'COMPLAINT_HOURGROUP',
]
test_buckets = (
    X_test_fine[bucket_fields + ['COMPLAINT_DAYOFWEEK']]
    .drop_duplicates()
    .copy()
)
test_buckets['DECIMAL_DATE'] = get_np_dates(test_buckets)

# DECIMAL_DATE holds datetimes, so the look-back bounds must be Timedeltas;
# subtracting a bare fraction of a year from a datetime raises TypeError.
nearest_lookback = pd.Timedelta(days=6)
farthest_lookback = pd.Timedelta(days=37)

# Start from an empty list so that re-running this cell is idempotent.
preds = []
for bucket in test_buckets.itertuples(index=False):
    comparison = all_all[
        (all_all['COMPLAINT_DAYOFWEEK'] == bucket.COMPLAINT_DAYOFWEEK) &
        (all_all['COMPLAINT_HOURGROUP'] == bucket.COMPLAINT_HOURGROUP) &
        (all_all['DECIMAL_DATE'] < bucket.DECIMAL_DATE - nearest_lookback) &
        (all_all['DECIMAL_DATE'] > bucket.DECIMAL_DATE - farthest_lookback)
    ]
    pred = comparison.groupby('ADDR_PCT_CD')[y_train_dvs].mean().reset_index()
    # Stamp the bucket's identifying fields onto every precinct row.
    for fld in bucket_fields:
        pred[fld] = getattr(bucket, fld)
    preds.append(pred)
all_preds = pd.concat(preds)

In [21]:
# Run the project's time-series model once per split; each call receives that
# split's train features/targets and test features/targets.
# NOTE(review): y_test_* is passed into the model — confirm in
# modules.fancy_time_series that it is not used to fit the predictions that
# are then scored against it.
y_ts_fine = fancy_time_series_model(X_train_fine, y_train_fine, X_test_fine, y_test_fine)
y_ts_coarse = fancy_time_series_model(X_train_coarse, y_train_coarse, X_test_coarse, y_test_coarse)
y_ts_2016 = fancy_time_series_model(X_train_2016, y_train_2016, X_test_2016, y_test_2016)

In [22]:
# Score the time-series predictions for the 'fine' split against its test
# targets.
eval_predictions(X_test_fine, y_test_fine, y_ts_fine)

------------------------------------------------------------------
Four-hour buckets:
------------------------------------------------------------------
Homicide:         R2 =     -2.1, RMSE =     0.055, RMSE (%) =  1977.185
Rape:             R2 =     -1.6, RMSE =     0.123, RMSE (%) =   884.016
Robbery:          R2 =      3.2, RMSE =     0.352, RMSE (%) =   304.105
FelonyAssault:    R2 =      4.0, RMSE =     0.367, RMSE (%) =   334.084
Burglary:         R2 =      2.6, RMSE =     0.348, RMSE (%) =   315.924
GrandLarceny:     R2 =     15.2, RMSE =     0.516, RMSE (%) =   205.752
GrandLarcenyAuto: R2 =      2.4, RMSE =     0.249, RMSE (%) =   422.572
Fraud:            R2 =      0.8, RMSE =     0.185, RMSE (%) =   578.384
Forgery:          R2 =      2.0, RMSE =     0.184, RMSE (%) =   618.552
Arson:            R2 =     -2.1, RMSE =     0.095, RMSE (%) =  1197.076
Drugs:            R2 =      4.2, RMSE =     0.200, RMSE (%) =   556.938
Weapons:          R2 =      2.4, RMSE =     0.177, RMSE

In [23]:
# Score the time-series predictions for the 'coarse' split against its test
# targets.
# NOTE(review): the printed banner reads "Four-hour buckets" for this split
# too — confirm the label inside eval_predictions is meant to be
# split-independent.
eval_predictions(X_test_coarse, y_test_coarse, y_ts_coarse)

------------------------------------------------------------------
Four-hour buckets:
------------------------------------------------------------------
Homicide:         R2 =     -1.7, RMSE =     0.050, RMSE (%) =  2114.411
Rape:             R2 =     -1.2, RMSE =     0.123, RMSE (%) =   868.470
Robbery:          R2 =      3.9, RMSE =     0.347, RMSE (%) =   304.614
FelonyAssault:    R2 =      5.2, RMSE =     0.377, RMSE (%) =   321.838
Burglary:         R2 =      2.0, RMSE =     0.333, RMSE (%) =   325.360
GrandLarceny:     R2 =     14.7, RMSE =     0.524, RMSE (%) =   201.744
GrandLarcenyAuto: R2 =      2.5, RMSE =     0.235, RMSE (%) =   441.849
Fraud:            R2 =      0.3, RMSE =     0.177, RMSE (%) =   601.274
Forgery:          R2 =      1.7, RMSE =     0.177, RMSE (%) =   630.242
Arson:            R2 =     -1.5, RMSE =     0.091, RMSE (%) =  1240.591
Drugs:            R2 =      4.0, RMSE =     0.187, RMSE (%) =   579.150
Weapons:          R2 =      3.7, RMSE =     0.183, RMSE

In [24]:
# Score the time-series predictions for the '2016' split against its test
# targets.
# NOTE(review): the printed banner reads "Four-hour buckets" for this split
# too — confirm the label inside eval_predictions is meant to be
# split-independent.
eval_predictions(X_test_2016, y_test_2016, y_ts_2016)

------------------------------------------------------------------
Four-hour buckets:
------------------------------------------------------------------
Homicide:         R2 =     -1.8, RMSE =     0.045, RMSE (%) =  2403.798
Rape:             R2 =     -1.4, RMSE =     0.114, RMSE (%) =   912.278
Robbery:          R2 =      3.2, RMSE =     0.311, RMSE (%) =   339.816
FelonyAssault:    R2 =      4.8, RMSE =     0.381, RMSE (%) =   311.632
Burglary:         R2 =      0.0, RMSE =     0.286, RMSE (%) =   376.725
GrandLarceny:     R2 =     12.2, RMSE =     0.515, RMSE (%) =   206.721
GrandLarcenyAuto: R2 =      0.4, RMSE =     0.196, RMSE (%) =   529.180
Fraud:            R2 =      0.1, RMSE =     0.156, RMSE (%) =   669.338
Forgery:          R2 =      3.1, RMSE =     0.198, RMSE (%) =   556.387
Arson:            R2 =     -1.7, RMSE =     0.071, RMSE (%) =  1618.622
Drugs:            R2 =      1.7, RMSE =     0.175, RMSE (%) =   624.060
Weapons:          R2 =      4.1, RMSE =     0.184, RMSE