In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

%matplotlib inline
pd.set_option('display.max_rows', 200)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from modules.nypd_data import read_orig_file
from modules.nypd_data import filter_raw_data
from modules.nypd_data import save_dated_felonies
from modules.nypd_data import load_dated_felonies
from modules.nypd_data import save_clean_felonies
from modules.nypd_data import load_clean_felonies
from modules.nypd_data import add_offense_category
from modules.nypd_data import add_datetime_columns
from modules.nypd_data import pivot_felonies
from modules.nypd_data import save_pivoted_felonies
from modules.nypd_data import load_pivoted_felonies

In [3]:
orig_file = read_orig_file(None, 'NYPD_Complaint_Data_Current_YTD.csv')

In [4]:
filter_raw_data(orig_file, '../precrime_data/raw_recent_felonies.csv')

In [5]:
dated_felonies = load_dated_felonies(None, '../precrime_data/raw_recent_felonies.csv')

In [17]:
dated_felonies.to_csv('../precrime_data/clean_recent_felonies.csv', header=False)

In [8]:
clean_recent = load_clean_felonies(None, '../precrime_data/clean_recent_felonies.csv')
clean_old = load_clean_felonies()

  mask |= (ar1 == a)


In [10]:
clean_old.head()

Unnamed: 0_level_0,COMPLAINT_DATETIME,REPORT_DATE,KY_CD,OFNS_DESC,BORO_NM,ADDR_PCT_CD,Latitude,Longitude
CMPLNT_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
318229393,2006-01-02 00:00:00,2006-01-02,109,GRAND LARCENY,MANHATTAN,18,40.759529,-73.984397
554420424,2006-01-02 00:00:00,2008-11-26,112,THEFT-FRAUD,BROOKLYN,68,40.630222,-74.023848
218039893,2006-01-02 00:00:00,2006-11-26,109,GRAND LARCENY,MANHATTAN,6,40.733916,-74.000781
542037522,2006-01-02 00:00:00,2006-01-02,126,MISCELLANEOUS PENAL LAW,BROOKLYN,76,40.675672,-74.006256
492019346,2006-01-02 00:01:00,2006-01-07,109,GRAND LARCENY,BROOKLYN,70,40.634735,-73.952237


In [24]:
pd.merge(clean_old, clean_recent, left_index=True, right_index=True) # Making sure no duplicates

Unnamed: 0_level_0,COMPLAINT_DATETIME_x,REPORT_DATE_x,KY_CD_x,OFNS_DESC_x,BORO_NM_x,ADDR_PCT_CD_x,Latitude_x,Longitude_x,COMPLAINT_DATETIME_y,REPORT_DATE_y,KY_CD_y,OFNS_DESC_y,BORO_NM_y,ADDR_PCT_CD_y,Latitude_y,Longitude_y
CMPLNT_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [4]:
# At this point: I just appended the contents of clean_recent_felonies to the old clean_felonies file

nypd_data = load_clean_felonies()
add_offense_category(nypd_data)

  mask |= (ar1 == a)


In [19]:
nypd_data.pivot_table(
    index=[
        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.year),
        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.month),
#        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.day),
#        nypd_data['COMPLAINT_DATETIME'].map(lambda x: x.hour),
#        'BORO_NM',
        'ADDR_PCT_CD',   # These are not duplicated across boros.
    ],
    values='KY_CD',
    columns='OFFENSE',
    fill_value=0,
    aggfunc=len
)

Unnamed: 0_level_0,Unnamed: 1_level_0,OFFENSE,Homicide,Rape,Robbery,FelonyAssault,Burglary,GrandLarceny,GrandLarcenyAuto,Fraud,Forgery,Arson,Drugs,Weapons,CriminalMischief,Other
COMPLAINT_DATETIME,COMPLAINT_DATETIME,ADDR_PCT_CD,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2006,1,1,0,1,5,7,23,125,5,4,6,1,0,0,8,3
2006,1,5,0,2,16,12,14,52,2,0,25,0,1,3,3,12
2006,1,6,0,0,16,5,27,96,2,3,6,0,7,1,10,2
2006,1,7,1,1,17,4,9,26,3,2,7,0,6,2,3,5
2006,1,9,0,0,17,8,19,63,8,5,3,1,4,3,7,7
2006,1,10,0,2,7,12,16,62,11,7,9,0,13,3,1,3
2006,1,13,0,2,31,10,35,130,5,25,10,1,4,1,6,5
2006,1,14,0,2,29,15,49,234,3,7,68,1,6,3,10,17
2006,1,17,1,2,7,6,21,86,3,6,5,0,0,0,10,4
2006,1,18,0,0,13,10,26,177,5,6,7,2,1,0,9,7


In [7]:
nypd_data[nypd_data['OFFENSE'] == 'Rape']

Unnamed: 0_level_0,COMPLAINT_DATETIME,REPORT_DATE,KY_CD,OFNS_DESC,BORO_NM,ADDR_PCT_CD,Latitude,Longitude,OFFENSE
CMPLNT_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
679636523,2006-01-02 07:00:00,2006-02-02,104,RAPE,BROOKLYN,73,,,Rape
999299719,2006-01-02 09:00:00,2006-07-27,116,SEX CRIMES,BRONX,52,,,Rape
183691926,2006-01-02 09:00:00,2006-01-04,116,SEX CRIMES,BROOKLYN,79,,,Rape
804000472,2006-01-02 15:00:00,2006-01-02,104,RAPE,BROOKLYN,83,,,Rape
141882369,2006-01-02 17:00:00,2006-01-12,104,RAPE,STATEN ISLAND,122,,,Rape
698798201,2006-01-02 20:40:00,2006-01-02,116,SEX CRIMES,QUEENS,104,,,Rape
237796058,2006-01-02 22:30:00,2006-01-03,116,SEX CRIMES,MANHATTAN,26,,,Rape
789270555,2006-01-03 00:01:00,2006-02-13,104,RAPE,BRONX,45,,,Rape
189609347,2006-01-03 00:01:00,2006-03-17,104,RAPE,QUEENS,114,,,Rape
367809694,2006-01-03 08:30:00,2006-02-06,116,SEX CRIMES,MANHATTAN,20,,,Rape


In [20]:
add_datetime_columns(nypd_data)

In [21]:
save_pivoted_felonies(nypd_data)

In [22]:
# Before running: execute "tar -xvf pivoted_felonies.tar.gz" from the root directory of the git repo
#
# To make the files from scratch (not necessary):
# save_pivoted_felonies(nypd_data)
nypd_pivoted = load_pivoted_felonies()

  mask |= (ar1 == a)


In [23]:
nypd_pivoted

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,COMPLAINT_DAYOFWEEK,Homicide,Rape,Robbery,FelonyAssault,Burglary,GrandLarceny,GrandLarcenyAuto,Fraud,Forgery,Arson,Drugs,Weapons,CriminalMischief,Other,COMPLAINT_IDS
COMPLAINT_YEAR,COMPLAINT_MONTH,COMPLAINT_DAY,COMPLAINT_HOURGROUP,ADDR_PCT_CD,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2006,1,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2006,1,2,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2006,1,2,0,6,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,218039893 236109607
2006,1,2,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2006,1,2,0,9,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,886457370
2006,1,2,0,10,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,974691933 697451116
2006,1,2,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2006,1,2,0,14,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,484715525
2006,1,2,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
2006,1,2,0,18,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,318229393 689081600
