# Machine Learning Final Project: Police Search Data Analysis

## Part 1: Setup

In [69]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn import ensemble # for fitting our model
from sklearn import svm
import pandas as pd
import itertools
from sklearn import preprocessing
from sklearn import model_selection
import numpy as np
import os
import math
import datetime

pd.set_option('display.max_columns', 100)

## Part 2: Dataset Processing

In [70]:
### IGNORE THIS CELL
# Get pertinent info about the data in a column
category = 'CORRECTED_search_based_on'

print(raw_data[category].value_counts()[:28])
print(raw_data[category].nunique())

PROBABLE CAUSE    249
Name: CORRECTED_search_based_on, dtype: int64
1


In [71]:
# Create a copy of raw_data to process
data = raw_data

In [72]:
# Binarize sex, and whether race was known at the time of the stop, and whether or not the search was successful
data['search_success'] = 1-pd.get_dummies(data['CORRECTED_search_discovered'])['NOTHING']
data['isMale'] = pd.get_dummies(data['SEX'])['M']
data['race_known'] = pd.get_dummies(data['RACE_KNOWN'])['YES - RACE OR ETHNICITY WAS KNOWN BEFORE STOP']

In [73]:
# Normalize date and geolocation columns
scaler = preprocessing.StandardScaler()

data['X_COORDINATE'] = scaler.fit_transform(data['X_COORDINATE'].values.astype(float).reshape(-1,1))
data['Y_COORDINATE'] = scaler.fit_transform(data['Y_COORDINATE'].values.astype(float).reshape(-1,1))
data['CAD_date_time'] = scaler.fit_transform(pd.to_datetime(
    data['CAD_date_time']).values.astype(float).reshape(-1,1))
data['REP_DATE'] = scaler.fit_transform(pd.to_datetime(data['REP_DATE']).values.astype(float).reshape(-1,1))

In [74]:
# One-hot encode race, reason for stop, whether the suspect's race was known, and location information
data_onehotencoded = pd.get_dummies(data[['APD_RACE_DESC','REASON_FOR_STOP_DESC',
                                          'CAD_sector']],prefix='',prefix_sep='')

In [75]:
# Join one-hot encoded and regular data and drop unprocessed categorical columns, address data, and id data
data_processed = data.join(data_onehotencoded,how='outer').drop(['PRIMARY_KEY','APD_RACE_DESC',
            'CORRECTED_search_discovered','CORRECTED_search_based_on','REASON_FOR_STOP_DESC',
            'CouncilDistrict','CAD_sector','RACE_KNOWN','PERSON_SEARCHED_DESC', 'SEX','LOCATION'],axis=1)

In [76]:
#Export processed data to csv
data_processed.to_csv('RacialProfilingArrest_data_processed.csv')

## Part 3: Model Building and Validation

In [77]:
# Import data
data_processed = pd.read_csv('RacialProfilingArrest_data_processed.csv').drop('Unnamed: 0',axis=1)

data_processed.head()

Unnamed: 0,REP_DATE,CAD_date_time,X_COORDINATE,Y_COORDINATE,search_success,isMale,race_known,BLACK,HISPANIC OR LATINO,WHITE,CALL FOR SERVICE,CONSENSUAL CONTACT,MOTOR VEHICLE DRIVER,OTHER,PRE-EXISTING KNOWLEDGE,SUSPICIOUS PERSON / VEHICLE,VIOLATION OF CITY ORDIANCE,VIOLATION OF PENAL CODE,VIOLATION OF TRANSPORTATION CODE/VEHICLE LAWS,ADAM PD,BAKER,CHARLIE,DAVID,EDWARD,FRANK,GEORGE,HENR,IDA,OOC,TRAVIS PD
0,-1.586818,-1.588026,0.753332,2.003187,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
1,-1.586818,-1.588026,-0.722298,-0.977299,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,-1.586818,-1.588026,0.018915,-1.402511,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
3,-1.586818,-1.588026,0.032796,1.896339,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,-1.586818,-1.588026,0.700682,-0.426675,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0


In [78]:
# Split and sample test and train data
X = data_processed[data_processed.columns.difference(['search_success'])].values
y = data_processed['search_success'].values

In [80]:
### Create a baseline model with which to test modifications to the net
def create_baseline():
    # create model
    model = Sequential()

    # The first layer of your network needs to know the input shape
#     print('Building model...')
    model = Sequential()
    model.add(Dense(29, input_dim=29, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

#     print('Compiling model...')
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

# Evaluate model using KFold Validation, and use binary_crossentropy loss function. 
# I chose to use accuracy as my scoring metric, because I wanted to minimize false positives
estimator = KerasClassifier(build_fn=create_baseline, epochs=25, batch_size=32, verbose=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Results: 60.65% (1.71%)


### Model Tuning

In [82]:
# Try building a smaller neural net
def create_smaller():
    # create model
#     print('Building model...')
    model = Sequential()
    model.add(Dense(20, input_dim=29, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

#     print('Compiling model...')
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model
    
estimator = KerasClassifier(build_fn=create_smaller, epochs=30, batch_size=30, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X, y, cv=kfold, verbose=0)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 59.45% (1.94%)


In [None]:
# Try building a bigger neural net-
def create_bigger():
    # create model
    #print('Building model...')
    model = Sequential()
    model.add(Dense(29, input_dim=29, kernel_initializer='normal', activation='relu'))
    model.add(Dense(14, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

    #print('Compiling model...')
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model
    
estimator = KerasClassifier(build_fn=create_bigger, epochs=30, batch_size=32, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, X, y, cv=kfold,verbose=0)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [18]:
# Logistic Regression
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(linear_model.LogisticRegression(max_iter=10000,tol=0.00001,solver='liblinear'), X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 62.85% (4.03%)


In [25]:
# SVM

kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(svm.SVC(tol=.0001,verbose=0,kernel='linear',gamma='auto'), X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Results: 63.08% (3.20%)


In [20]:
svm.SVC?

In [21]:
# Random Forest
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(ensemble.RandomForestClassifier(verbose=0,n_estimators=100,max_features='log2'), X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


Results: 62.71% (4.96%)


In [46]:
cross_val_score?