# Crime in Chicago

The objective of this project is to predict whether a person who committed a particular crime was arrested.

In [2]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine  
import pickle
import feather
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline
import xgboost
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
import joblib
from joblib import dump, load

In [3]:
%reload_ext autoreload
%autoreload 2

In [4]:
from fastai.tabular import * 

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# This is the local data path
#data_path = '/Users/kevin/Downloads/Crimes_-_2001_to_paresent.csv'

In [3]:
weather_data_path = '/Users/kevin/Downloads/1598904.csv'

In [41]:
iucr_codes = '/Users/kevin/Downloads/Chicago_Police_Department_-_Illinois_Uniform_Crime_Reporting__IUCR__Codes.csv'

/Users/kevin/Downloads/Chicago_Police_Department_-_Illinois_Uniform_Crime_Reporting__IUCR__Codes.csv

CREATE TABLE IF NOT EXISTS ChicagoCrime (
        ID integer,
        CaseNumber varchar(20),
        Date varchar(50),
        Block varchar(50),
        IUCR varchar(10),
        PrimaryType varchar(50),
        Description varchar(100),
        LocationDescription varchar(150),
        Arrest varchar(10),
        Domestic varchar(10),
        Beat integer,
        District real,
        Ward real,
        CommunityArea real,
        FBICode varchar(10),
        XCoordinate varchar(20),
        YCoordinate varchar(20),
        Year integer,
        UpdatedOn varchar(50),
        Latitude varchar(15),
        Longitude varchar(15),
        Location varchar(50)
    );


## Pull Data From Server

In [13]:
cnx = create_engine('postgresql://ubuntu:@54.91.118.64:5432/chicago')
df = pd.read_sql_query('''SELECT * FROM chicagocrime''', cnx)

In [38]:
df['datetime'] = pd.to_datetime(df['date'], infer_datetime_format=True)
mask = df['arrest']  == 'true'
print('Percent of Crimes ending in Arrest: ' + len(df[mask])/len(df)

In [42]:
#df.to_feather('chicago_crime.feather')
df = feather.read_dataframe('chicago_crime')

You have the IUCR codes so pull in the description for those codes in your downloads

Can check Chicago Economy for more data

## Read in Weather Data

In [97]:
df_weather = pd.read_csv(weather_data_path)

WT03 - Thunder  
WT04 - Ice pellets, sleet, snow pellets, or small hail"  
PRCP - Precipitation  
WT05 - Hail (may include small hail)  
WV03 - Thunder  
WT06 - Glaze or rime   
WT07 - Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction  
WT08 - Smoke or haze   
SNWD - Snow depth  
WT09 - Blowing or drifting snow  
WDF2 - Direction of fastest 2-minute wind  
WDF5 - Direction of fastest 5-second wind  
PGTM - Peak gust time  
WT11 - High or damaging winds  
TMAX - Maximum temperature  
WT13 - Mist  
WSF2 - Fastest 2-minute wind speed  
FMTM - Time of fastest mile or fastest 1-minute wind  
WSF5 - Fastest 5-second wind speed  
SNOW - Snowfall  
WT14 - Drizzle  
WT15 - Freezing drizzle   
WT16 - Rain (may include freezing rain, drizzle, and freezing drizzle)"   
WT17 - Freezing rain   
WT18 - Snow, snow pellets, snow grains, or ice crystals  
WT19 - Unknown source of precipitation   
AWND - Average wind speed  
WT21 - Ground fog  
WT22 - Ice fog or freezing fog  
WV20 - Rain or snow shower  
WT01 - Fog, ice fog, or freezing fog (may include heavy fog)  
WESD - Water equivalent of snow on the ground  
WT02 - Heavy fog or heaving freezing fog (not always distinguished from fog)  
TAVG - Average Temperature.  
TMIN - Minimum temperature  
TSUN - Total sunshine for the period  

In [98]:
df_weather.columns = map(str.lower, df_weather.columns)
df_weather['datetime'] = pd.to_datetime(df_weather['date'], infer_datetime_format=True)

## Merge Weather Data and Crime Data

In [100]:
df_weather = df_weather.sort_values('datetime')
df = df.sort_values('datetime')
cw_df = pd.merge_asof(df, df_weather, on = 'datetime', direction = 'backward', tolerance = pd.Timedelta('1 day')) 
df = cw_df.reset_index()
#df.to_feather('chicago_crime_and_weather.feather')

## Fastai (Keep in to run with more categorical variables)

In [6]:
df = feather.read_dataframe('chicago_crime_and_weather.feather')

In [70]:
droplist = ['id', 'casenumber', 'date_x', 'block', 'updatedon', 'station', 'name', 'date_y', 'index', 'xcoordinate', 'ycoordinate', 'year', 'location', 'datetime']

In [10]:
df = df.drop(droplist,axis=1)

In [63]:
df['latitude'] = df['latitude'].astype('float64', errors = 'ignore')
df['longitude'] = df['longitude'].astype('float64', errors = 'ignore')

In [52]:
categorical_list = []

for i in df.columns:
    
    try:
        #print(i, len(df[f'{i}'].unique()))
        if len(df[f'{i}'].unique()) < 10000:
            categorical_list.append(i)
    except:
        i, print('NaNs')

In [71]:
path = '/home/kevin/Metis/Projects/'
procs = [FillMissing, Categorify, Normalize]
valid_idx = range(len(df)-1500000, len(df))
dep_var = 'arrest'
cat_names = categorical_list

In [78]:
data = TabularDataBunch.from_df(path, df, dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)
print(data.train_ds.cont_names)  # `cont_names` defaults to: set(df)-set(cat_names)-{dep_var}


['longitude', 'latitude']


In [79]:
(cat_x,cont_x),y = next(iter(data.train_dl))
for o in (cat_x, cont_x, y): print(to_np(o[:5]))

[[140   8 147 126 ...   0   0   1   1]
 [274  18 271 155 ...   0   0   1   1]
 [248  22 240 126 ...   0   0   1   1]
 [ 18  29  54 143 ...   0   0   1   1]
 [252  18 264 146 ...   0   0   1   1]]
[[ 0.164973 -1.281818]
 [-0.369244  0.393115]
 [ 2.245295 -1.502459]
 [-1.696505  1.147466]
 [ 0.241849 -1.029409]]
[0 1 0 0 1]


In [83]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,0.000000,0.000000,1.000000


## View the Entire Dataset

In [27]:
def display_all(df):
    with pd.option_context("display.max_rows", 100, "display.max_columns", 500): 
        display(df)

In [30]:
df.columns

Index(['index', 'id', 'casenumber', 'date_x', 'block', 'iucr', 'primarytype',
       'description', 'locationdescription', 'arrest', 'domestic', 'beat',
       'district', 'ward', 'communityarea', 'fbicode', 'xcoordinate',
       'ycoordinate', 'year', 'updatedon', 'latitude', 'longitude', 'location',
       'datetime', 'station', 'name', 'date_y', 'awnd', 'fmtm', 'pgtm', 'prcp',
       'snow', 'snwd', 'tavg', 'tmax', 'tmin', 'tsun', 'wdf2', 'wdf5', 'wesd',
       'wsf2', 'wsf5', 'wt01', 'wt02', 'wt03', 'wt04', 'wt05', 'wt06', 'wt07',
       'wt08', 'wt09', 'wt11', 'wt13', 'wt14', 'wt15', 'wt16', 'wt17', 'wt18',
       'wt19', 'wt21', 'wt22', 'wv03', 'wv20'],
      dtype='object')

In [28]:
display_all(df)

Unnamed: 0,index,id,casenumber,date_x,block,iucr,primarytype,description,locationdescription,arrest,domestic,beat,district,ward,communityarea,fbicode,xcoordinate,ycoordinate,year,updatedon,latitude,longitude,location,datetime,station,name,date_y,awnd,fmtm,pgtm,prcp,snow,snwd,tavg,tmax,tmin,tsun,wdf2,wdf5,wesd,wsf2,wsf5,wt01,wt02,wt03,wt04,wt05,wt06,wt07,wt08,wt09,wt11,wt13,wt14,wt15,wt16,wt17,wt18,wt19,wt21,wt22,wv03,wv20
0,0,10473864,HZ213356,01/01/2001 12:00:00 AM,012XX S DAMEN AVE,1582,OFFENSE INVOLVING CHILDREN,CHILD PORNOGRAPHY,OTHER,false,false,1233,12.0,2.0,28.0,17,,,2001,04/09/2016 03:47:49 PM,,,,2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
1,1,4071818,HL415251,01/01/2001 12:00:00 AM,098XX S DR MARTIN LUTHER KING JR DR,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,false,false,511,5.0,6.0,49.0,02,1180604,1839894,2001,03/31/2006 10:03:38 PM,41.715923338,-87.614138393,"(41.715923338, -87.614138393)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
2,2,7208412,HR623380,01/01/2001 12:00:00 AM,003XX W 105TH PL,1754,OFFENSE INVOLVING CHILDREN,AGG SEX ASSLT OF CHILD FAM MBR,RESIDENCE,false,false,512,5.0,34.0,49.0,02,1175689,1835000,2001,11/08/2009 01:05:50 AM,41.702604761,-87.632285405,"(41.702604761, -87.632285405)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
3,3,1316070,G007383,01/01/2001 12:00:00 AM,041XX W 24 PL,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,false,false,1013,10.0,,,26,1149237,1887440,2001,08/17/2015 03:03:40 PM,41.84705874,-87.7277947,"(41.84705874, -87.7277947)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
4,4,4803028,HM414773,01/01/2001 12:00:00 AM,055XX S TRIPP AVE,1753,OFFENSE INVOLVING CHILDREN,SEX ASSLT OF CHILD BY FAM MBR,RESIDENCE,true,false,813,8.0,13.0,62.0,02,1149024,1867199,2001,10/03/2006 05:10:58 AM,41.791518681,-87.729098793,"(41.791518681, -87.729098793)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
5,5,7277327,HR690758,01/01/2001 12:00:00 AM,023XX S SACRAMENTO AVE,0841,THEFT,FINANCIAL ID THEFT:$300 &UNDER,RESIDENCE,false,false,1033,10.0,12.0,30.0,06,1156774,1888069,2001,01/01/2010 01:04:39 AM,41.848635623,-87.700116823,"(41.848635623, -87.700116823)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
6,6,2429863,HH749904,01/01/2001 12:00:00 AM,095XX S WESTERN AVE,1120,DECEPTIVE PRACTICE,FORGERY,COMMERCIAL / BUSINESS OFFICE,false,false,2213,22.0,19.0,72.0,10,1162090,1841396,2001,08/17/2015 03:03:40 PM,41.720449455,-87.681904417,"(41.720449455, -87.681904417)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
7,7,2443081,HH766202,01/01/2001 12:00:00 AM,049XX S WOODLAWN AVE,0810,THEFT,OVER $500,RESIDENCE,false,false,2124,2.0,4.0,39.0,06,1185031,1872477,2001,08/17/2015 03:03:40 PM,41.805231607,-87.596903083,"(41.805231607, -87.596903083)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
8,8,1362018,G067736,01/01/2001 12:00:00 AM,012XX N SPRINGFIELD AV,0560,ASSAULT,SIMPLE,RESIDENCE,false,false,2535,25.0,,,08A,1150113,1908050,2001,08/17/2015 03:03:40 PM,41.903597909,-87.724042825,"(41.903597909, -87.724042825)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,
9,9,7331615,HS135960,01/01/2001 12:00:00 AM,106XX S EDBROOKE AVE,0840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,false,false,512,5.0,9.0,49.0,06,1179186,1834330,2001,02/12/2010 01:26:47 AM,41.700687359,-87.619500705,"(41.700687359, -87.619500705)",2001-01-01 00:00:00,USW00094846,"CHICAGO OHARE INTERNATIONAL AIRPORT, IL US",2001-01-01,7.61,1443.0,1504.0,0.00,0.0,17.0,15.0,24,5,,320,20.0,0.0,14.1,17.9,,,,,,,,,,,,,,,,,,,,,


In [None]:
droplist = []

In [112]:
df = df.drop('index', axis = 1)
df = df.drop('casenumber', axis = 1)
df = df.drop('id', axis = 1)
df = df.drop('block', axis = 1)
df = df.drop('station', axis = 1)
df = df.drop('fmtm', axis = 1)
df = df.drop('pgtm', axis = 1)
df = df.drop('snwd', axis = 1)
df = df.drop('xcoordinate', axis = 1)
df = df.drop('ycoordinate', axis = 1)
df = df.drop('datetime', axis = 1)
df = df.drop('tavg', axis = 1)
df = df.drop('date_y', axis = 1)
df = df.drop('iucr', axis = 1)
df = df.drop('name', axis = 1)
df = df.drop('year', axis = 1)
df = df.drop('updatedon', axis = 1)
df = df.drop('location', axis = 1)
df = df.drop(['wdf2', 'wdf5', 'wesd', 'wsf2', 'wsf5', 'wt01',
       'wt02', 'wt03', 'wt04', 'wt05', 'wt06', 'wt07', 'wt08', 'wt09', 'wt11',
       'wt13', 'wt14', 'wt15', 'wt16', 'wt17', 'wt18', 'wt19', 'wt21', 'wt22',
       'wv03', 'wv20', 'tsun'], axis = 1)

df['primarytype'] = df['primarytype'].astype('category')
df['description'] = df['description'].astype('category')
df['locationdescription'] = df['locationdescription'].astype('category')
#df['arrest'] = df['arrest'].astype('int')
df['arrest'].replace('true', 1, inplace = True)
df['arrest'].replace('false', 0, inplace = True)
#df['domestic'] = df['domestic'].astype('bool')
df['domestic'].replace('true', 1, inplace = True)
df['domestic'].replace('false', 0, inplace = True)
df['fbicode'] = df['fbicode'].astype('category')
df = df.drop('date_x', axis = 1)
df['xcoordinate'] = df['xcoordinate'].fillna(value=np.nan)
df['xcoordinate'] = df['xcoordinate'].astype('int64', errors = 'ignore')
df['ycoordinate'] = df['ycoordinate'].fillna(value=np.nan)
df['ycoordinate'] = df['ycoordinate'].astype('int64', errors = 'ignore')
df['latitude'] = df['latitude'].astype('float64', errors = 'ignore')
df['longitude'] = df['longitude'].astype('float64', errors = 'ignore')
df['station'] = df['station'].astype('category')

In [None]:
df.to_feather('chicago_crime_cleaned.feather')
df = feather.read_dataframe('chicago_crime_cleaned.feather')

Reluctant drops

In [223]:
df = df.dropna(subset=['district'])
df = df.dropna(subset=['latitude'])
df['locationdescription'] = df.locationdescription.fillna(value='OTHER')
df['communityarea'] = df.sort_values(by=['beat', 'district', 'ward'])['communityarea'].fillna(method='ffill')
df['ward'] = df.sort_values(by=['beat', 'district', 'communityarea'])['ward'].fillna(method='ffill')

In [None]:
for header in df.columns:
    
    nulls_count = df[f'{header}'].isnull().sum()
    
    print(f'There are {nulls_count} in {header}')

## EDA

In [248]:
df = df.reset_index()

In [249]:
#df.to_feather('chicago_crime_final.feather')

In [3]:
df = feather.read_dataframe('chicago_crime_final.feather')

  labels, = index.labels


In [4]:
df = df.drop('fbicode', axis = 1)


## Fast.ai

In [6]:
df = df[0:5000000]
df_holdout = df[5000001:]

In [7]:
path = '/home/kevin/Metis/Projects/'

In [11]:
procs = [FillMissing, Categorify, Normalize]

In [12]:
valid_idx = range(len(df)-1500000, len(df))

In [13]:
dep_var = 'arrest'
cat_names = ['primarytype', 'description', 'locationdescription', 'domestic', 'district', 'communityarea',]


In [14]:
data = TabularDataBunch.from_df(path, df, dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)
print(data.train_ds.cont_names)  # `cont_names` defaults to: set(df)-set(cat_names)-{dep_var}


['prcp', 'beat', 'snow', 'tmax', 'longitude', 'latitude', 'tmin', 'ward', 'index', 'awnd']


In [15]:
(cat_x,cont_x),y = next(iter(data.train_dl))
for o in (cat_x, cont_x, y): print(to_np(o[:5]))

[[ 18  88 128   1  12  24]
 [  2 291 151   1   9  62]
 [ 31  54 142   1   9  59]
 [ 34   2 158   1  14  26]
 [ 19 282 158   1  15  16]]
[[ 1.132383  0.001912 -0.156108  0.906265 -0.082284  0.394733  1.318209  0.251749 -1.428131 -0.502583]
 [-0.330532 -0.388702 -0.156108 -0.01469   0.468533 -0.388417  0.160829 -0.831463  0.06903  -0.305798]
 [-0.330532 -0.419838 -0.156108 -1.711187 -0.283599 -0.21096  -1.272118 -0.759248  1.44853   0.678129]
 [-0.330532  0.457629 -0.156108  1.003207 -1.250172  0.397458  1.373322  0.396178  1.725879 -0.241182]
 [-0.330532  0.599156 -0.156108 -0.596346 -1.63598   1.124255 -0.445418  1.118319  0.546514  1.400655]]
[0 0 0 0 1]


In [22]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit_one_cycle(2, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,0.304955,0.316658,0.889338
2,0.310646,0.343397,0.878885


In [19]:
learn.get_preds()

[tensor([[0.0904, 0.9096],
         [0.8049, 0.1951],
         [0.8069, 0.1931],
         ...,
         [0.8192, 0.1808],
         [0.0354, 0.9646],
         [0.7359, 0.2641]]), tensor([1, 0, 0,  ..., 1, 1, 0])]

In [None]:
learn.predict(df.iloc[0])

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6735796 entries, 0 to 6735795
Data columns (total 18 columns):
index                  int64
primarytype            category
description            category
locationdescription    category
arrest                 int64
domestic               int64
beat                   int64
district               float64
ward                   float64
communityarea          float64
fbicode                category
latitude               float64
longitude              float64
awnd                   float64
prcp                   float64
snow                   float64
tmax                   int64
tmin                   int64
dtypes: category(4), float64(8), int64(6)
memory usage: 758.0 MB


In [4]:
df.corr()

Unnamed: 0,index,arrest,domestic,beat,district,ward,communityarea,latitude,longitude,awnd,prcp,snow,tmax,tmin
index,1.0,-0.055044,0.04337,-0.035996,-0.004956,0.013127,0.004968,-0.005265,0.001056,0.036246,0.016684,0.020514,0.01734,0.038502
arrest,-0.055044,1.0,-0.069274,-0.015993,-0.01678,-0.015836,-0.008292,0.002096,-0.031477,0.001616,-0.009167,0.00233,-0.023662,-0.025416
domestic,0.04337,-0.069274,1.0,-0.041821,-0.038657,-0.050101,0.072056,-0.075669,0.004518,0.002332,0.002825,0.002082,0.004467,0.003772
beat,-0.035996,-0.015993,-0.041821,1.0,0.939092,0.635785,-0.506381,0.61265,-0.473687,-0.003126,-0.000468,0.000737,-0.002075,-0.002319
district,-0.004956,-0.01678,-0.038657,0.939092,1.0,0.68874,-0.499337,0.620597,-0.528367,-0.001122,-2.7e-05,0.000919,-0.001339,-0.00122
ward,0.013127,-0.015836,-0.050101,0.635785,0.68874,1.0,-0.532559,0.626385,-0.432463,5.9e-05,-1.1e-05,0.001221,-4.9e-05,0.000588
communityarea,0.004968,-0.008292,0.072056,-0.506381,-0.499337,-0.532559,1.0,-0.747118,0.240317,0.000821,0.001185,-0.000435,0.001802,0.001377
latitude,-0.005265,0.002096,-0.075669,0.61265,0.620597,0.626385,-0.747118,1.0,-0.410834,-2.8e-05,-0.000483,0.001649,-0.003313,-0.00263
longitude,0.001056,-0.031477,0.004518,-0.473687,-0.528367,-0.432463,0.240317,-0.410834,1.0,-0.002323,0.000345,-0.002221,0.007822,0.007829
awnd,0.036246,0.001616,0.002332,-0.003126,-0.001122,5.9e-05,0.000821,-2.8e-05,-0.002323,1.0,0.080271,0.099045,-0.250733,-0.215913


In [5]:
df = pd.concat([df, pd.get_dummies(df['primarytype'])], axis = 1)

In [6]:
df = df.drop('primarytype', axis = 1)

In [7]:
#df = pd.concat([df, pd.get_dummies(df['description'])], axis = 1)

In [8]:
df = df.drop('description', axis = 1)

In [9]:
df = pd.concat([df, pd.get_dummies(df['locationdescription'])], axis = 1)

KeyboardInterrupt: 

In [None]:
df = df.drop('locationdescription', axis = 1)

### Create an Evaluation Function

In [3]:
def evaluate_model(clf):
    
    train_preds = clf.predict(X_train)
    train_auc = roc_auc_score(y_train, train_preds)
    val_preds = clf.predict(X_val)
    val_auc = roc_auc_score(y_val, val_preds)
    test_preds = clf.predict(X_test)
    test_auc = roc_auc_score(y_test, test_preds)
    train_score = clf.score(X_train, y_train)
    val_score = clf.score(X_val, y_val)
    test_score = clf.score(X_test, y_test)
    confusion_mat = confusion_matrix(y_test, test_preds)
    
    return print(f"AUC for training set: {train_auc} \nAUC for validation set: {val_auc} \nAUC for test set: {test_auc} \nScore for training set: {train_score}\nScore for validation set: {val_score} \nScore for test set: {test_score} \nConfusion Matrix: \n{confusion_mat}")

## Create a Model

In [4]:
#df.to_feather('chicago_crime_model_data.feather')

In [12]:
df = feather.read_dataframe('chicago_crime_model_data.feather')

  labels, = index.labels


In [3]:
#df = pd.read_pickle('model_df.p')

In [4]:
#df = df[0:1000000]

In [13]:
df = df.drop(['index', 'domestic', 'beat', 'district', 'ward', 'communityarea'], axis = 1)

In [14]:
y = df['arrest']
X = df.drop('arrest', axis = 1)

In [15]:
#Test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
#Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

In [None]:
learn = tabular_learner(df, layers=[200,100], emb_szs={'native-country': 10}, metrics=accuracy)
learn.fit_one_cycle(1, 1e-2)

### Logistic Regression

In [19]:
clf_logistic_pipeline = Pipeline([('scale_train', StandardScaler()),  ('lr', LogisticRegression())])

In [20]:
clf_logistic_pipeline.fit(X_train, y_train)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('scale_train', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [21]:
evaluate_model(clf_logistic_pipeline)

  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)
  Xt = transform.transform(Xt)


AUC for training set: 0.7819176581804969 
AUC for validation set: 0.7817091792999603 
AUC for test set: 0.7818304472777092 
Score for training set: 0.8653519925078904
Score for validation set: 0.8651978977998159 
Score for test set: 0.8653141251789568 
Confusion Matrix: 
[[1415779   45369]
 [ 226796  332795]]


In [22]:
dump(clf_logistic_pipeline, 'clf_logistic_pipeline.joblib') 
#clf_logistic_pipeline = load('filename.joblib') 

['clf_logistic_pipeline.joblib']

In [29]:
# You should create function for this and put it in a class
importance_list = []


for tup in zip(X_train.columns, np.exp(clf_logistic_pipeline.named_steps['lr'].coef_[0])):
    
    importance_list.append(tup) 
    sorted_importance_list = sorted(importance_list, key=lambda tup: tup[1], reverse = True)
sorted_importance_list[0:10]

[('NARCOTICS', 6.4495205582757906),
 ('PROSTITUTION', 1.9409004467370199),
 ('DEPARTMENT STORE', 1.3555689429902227),
 ('CRIMINAL TRESPASS', 1.3481573587031486),
 ('GAMBLING', 1.3177199360656247),
 ('GROCERY FOOD STORE', 1.3097096245166744),
 ('LIQUOR LAW VIOLATION', 1.2945964863184407),
 ('WEAPONS VIOLATION', 1.273162809482963),
 ('DRUG STORE', 1.1821135829524616),
 ('INTERFERENCE WITH PUBLIC OFFICER', 1.1708414586148312)]

In [30]:
# You should create function for this and put it in a class
importance_list = []


for tup in zip(X_train.columns, np.exp(clf_logistic_pipeline.named_steps['lr'].coef_[0])):
    
    importance_list.append(tup) 
    sorted_importance_list = sorted(importance_list, key=lambda tup: tup[1], reverse = False)
sorted_importance_list[0:10]

[('THEFT', 0.50994961136723405),
 ('CRIMINAL DAMAGE', 0.60289520151307407),
 ('BURGLARY', 0.67379653479959389),
 ('ROBBERY', 0.74421515295069274),
 ('MOTOR VEHICLE THEFT', 0.76248102164524201),
 ('RESIDENCE', 0.81641032362207477),
 ('DECEPTIVE PRACTICE', 0.82524604366744958),
 ('RESIDENCE-GARAGE', 0.907916971075862),
 ('OTHER OFFENSE', 0.91552311012662413),
 ('BATTERY', 0.91702718754638513)]

### Random Forest

In [5]:
set_rf_samples(1000000)
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)

NameError: name 'set_rf_samples' is not defined

In [10]:
clf_rf = RandomForestClassifier(n_estimators = 50, max_depth = 10, min_samples_leaf = 2, oob_score=True, n_jobs=-1)
clf_rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [38]:
evaluate_model(clf_rf)

AttributeError: module 'matplotlib.pyplot' has no attribute 'confusion_mat'

<Figure size 900x600 with 0 Axes>

In [12]:
clf_rf.oob_score_

0.84806602800330488

In [13]:
# You should create function for this and put it in a class
importance_list = []

for tup in zip(X_train.columns, clf_rf.feature_importances_):
    
    importance_list.append(tup) 
    sorted_importance_list = sorted(importance_list, key=lambda tup: tup[1], reverse = True)
sorted_importance_list[0:10]

[('NARCOTICS', 0.52285329801007996),
 ('CRIMINAL TRESPASS', 0.071447538463131677),
 ('THEFT', 0.06582852637974039),
 ('PROSTITUTION', 0.050390058821393059),
 ('SIDEWALK', 0.044579003890294074),
 ('CRIMINAL DAMAGE', 0.032731838878306781),
 ('WEAPONS VIOLATION', 0.030913909633955301),
 ('DEPARTMENT STORE', 0.017539930207047666),
 ('RESIDENCE', 0.017514975629403056),
 ('BURGLARY', 0.016004138556858317)]

In [18]:
dump(clf_rf, 'clf_rf.joblib') 
#clf_rd = load('filename.joblib') 

['clf_rf.joblib']

In [14]:
clf_rf.feature_importances_

array([  7.50803484e-03,   4.34079250e-03,   2.85846843e-04,
         1.91034707e-04,   1.15426243e-04,   4.30853152e-04,
         6.23031608e-04,   6.59636071e-05,   2.20711803e-03,
         1.04331357e-02,   1.60041386e-02,   1.04366599e-05,
         2.55052049e-04,   3.27318389e-02,   7.14475385e-02,
         1.91187174e-03,   0.00000000e+00,   7.04978735e-03,
         3.10347170e-04,   7.62484927e-08,   4.64566493e-03,
         8.64030317e-06,   5.21912691e-05,   8.05304215e-03,
         1.34523985e-02,   5.22853298e-01,   3.07590048e-07,
         5.52516225e-07,   0.00000000e+00,   3.03786299e-05,
         6.90121566e-05,   2.72216348e-06,   2.30679071e-03,
         5.03900588e-02,   4.67857858e-06,   4.65191196e-03,
         0.00000000e+00,   1.40561681e-02,   5.55527150e-05,
         1.02554892e-05,   6.58285264e-02,   3.09139096e-02,
         8.26099763e-05,   2.26671212e-06,   9.99074812e-08,
         3.09769398e-08,   2.49210649e-06,   3.53941617e-07,
         3.21642766e-09,

In [31]:
cm = confusion_matrix(y_test, preds)
plt.figure(dpi=150)
sns.heatmap(cm, cmap=plt.cm.Blues, annot=True, square=True, fmt = '.7g',
           xticklabels= ['No arrest', 'Arrest'],
           yticklabels= ['No Arrest', 'Arrest'])

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix');

NameError: name 'preds' is not defined

kernel SVM, KNN, Extra Trees

## Grid Search RF

In [31]:
rfc = RandomForestClassifier(n_jobs=-1)

In [32]:
parameters = {'n_estimators':[10,20,30], 'max_depth' : [3,7, 10, None], 'min_samples_leaf':[1,3,5,7]}

In [33]:
rfc_clf = GridSearchCV(rfc, parameters, cv=5)

In [None]:
%%time
rfc_clf.fit(X_train, y_train)

In [None]:
rfc_clf.best_params

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 30], 'max_depth': [3, 7, 10, None], 'min_samples_leaf': [1, 3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
rfb = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
rfb.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## KNN

In [11]:
clf_knn_pipeline = Pipeline([('scale_train', StandardScaler()),  ('lr', KNeighborsClassifier(n_neighbors=5, n_jobs=-1))])

In [12]:
clf_knn_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('scale_train', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform'))])

In [13]:
evaluate_model(clf_knn_pipeline)

AUC for training set: 0.8304842129492409 
AUC for validation set: 0.7900891238840703 
AUC for test set: 0.7887500002362207 
Score for training set: 0.8863387755102041
Score for validation set: 0.8525904761904762 
Score for test set: 0.8523133333333334 
Confusion Matrix: 
[[199987  12419]
 [ 31887  55707]]


In [14]:
pd.to_pickle(clf_knn_pipeline, 'knn_clf_pipeline.p')

In [50]:
nbrs = KNeighborsClassifier(n_neighbors=5)

In [None]:
%%time
nbrs.fit(X_train_scaled, y_train)

CPU times: user 18min 58s, sys: 1.53 s, total: 18min 59s
Wall time: 18min 57s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
evaluate_model(nbrs, X_train=X_train_scaled, X_val=X_val_scaled, X_test=X_test_scaled)

In [73]:
#%% time
train_preds = nbrs.predict(X_train)
roc_auc_score(y_train, train_preds)

0.81331357275038685

In [72]:
#test score
# n_neighbors=5 scores 0.7659
preds = nbrs.predict(X_val)
roc_auc_score(y_val, preds)

0.76138047840754519

In [74]:
confusion_matrix(y_val, preds)

array([[139939,   8366],
       [ 25963,  35732]])

## Gradient Boosting

In [21]:
gb_clf = GradientBoostingClassifier(learning_rate=0.01)
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [22]:
#training score
train_preds = gb_clf.predict(X_train)
roc_auc_score(y_train, train_preds)

0.7363564703752371

In [23]:
evaluate_model(gb_clf)

AUC for training set: 0.7363564703752371 
AUC for validation set: 0.7366049415361069 
AUC for test set: 0.7366194163182307 
Score for training set: 0.8500505523491769
Score for validation set: 0.8500096852779533 
Score for test set: 0.8501983680227877 
Confusion Matrix: 
[[1448281   12867]
 [ 289843  269748]]


In [66]:
gb_clf.score(X_val, y_val)

0.84116666666666662

In [24]:
'''
#test score
#standard random forest is 0.79
preds = gb_clf.predict(X_val)
roc_auc_score(y_val, preds)
'''

'\n#test score\n#standard random forest is 0.79\npreds = gb_clf.predict(X_val)\nroc_auc_score(y_val, preds)\n'

In [68]:
#confusion_matrix(y_val, preds)

array([[147445,    860],
       [ 32495,  29200]])

Try balancing the dataset first

## Balancing the Dataset

In [42]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_train,y_train)

In [45]:
# Yay, balanced classes!
len(y_resampled), len(X_resampled)

(693394, 693394)

In [53]:
rfb_balanced = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [55]:
rfb_balanced.fit(X_resampled, y_resampled)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
preds = rfb_balanced.predict(X_val)
roc_auc_score(y_val, preds)

0.79100646692502496

In [58]:
rfb_balanced.score(X_val, y_val)

0.849547619047619

In [60]:
rfb_balanced.score(X_test, y_test)

0.84939666666666669

In [62]:
confusion_matrix(y_val, preds)

array([[138361,   9944],
       [ 21651,  40044]])

## Ensemble of Several Models

In [27]:
model_list = [('lr', clf_logistic_pipeline), ('rf', clf_rf), ('gb', gb_clf)]

In [28]:
# create voting classifier
voting_classifer = VotingClassifier(estimators=model_list,
                                    voting='hard', #<-- sklearn calls this hard voting
                                    n_jobs=-1)
voting_classifer.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', Pipeline(memory=None,
     steps=[('scale_train', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, ...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='hard', weights=None)

In [30]:
evaluate_model(voting_classifer)

AUC for training set: 0.7437367358291241 
AUC for validation set: 0.7436991551717599 
AUC for test set: 0.743873117622716 
Score for training set: 0.8540023311343996
Score for validation set: 0.8538194635911314 
Score for test set: 0.8540613112331676 
Confusion Matrix: 
[[1447775   13373]
 [ 281531  278060]]


In [81]:
preds = voting_classifer.predict(X_val)
roc_auc_score(y_val, preds)

0.76105835637984365