# Predicting which Murders will be Unsolved using Murder Accountability Project data

### Data Load

In [40]:
import csv
import pandas as pd
import numpy as np
import codecs
import patsy

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.figure
%matplotlib inline

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import recall_score, classification_report, roc_curve, auc, \
accuracy_score,f1_score, confusion_matrix, precision_recall_curve

from sklearn.learning_curve import validation_curve

import random
import pickle

pd.set_option('display.max_columns', 50)

Reading in all of the data and then creating a 10% random sample to make a more manageable dataset.

In [41]:
doc = codecs.open('Year_all.csv','rU','UTF-16') 
dfall = pd.read_csv(doc, sep='\t')
dfsample = dfall.sample(frac=0.1, replace=True)
print(dfall.shape)
print(dfsample.shape)

(752313, 34)
(75231, 34)


### Preprocessing:    
Replace all unknows with NaN         
Remove unecssary columns.  This includes indices, file date, etc.       
The "offender" columns and relationship columns are removed because when predicting unsolved murders nothing is known about the offender.     

In [42]:
df = dfsample.replace(999, np.nan)
df = df.replace ("Unknown", np.nan)
df = df.replace ("Unknown or not reported", np.nan)
df = df.drop(['Action Type','Agency','Cntyfips','Calculation2', 'Circumstance','File Date', 'Homicide', 
              'ID', 'Incident' ,'Ori', 'Off Age','Off Count', 'Off Sex', 'Off Ethnic', 'Off Race', 
              'Relationship','Situation','Source','State Name','State', 'Vic Ethnic', 'Subcircum','Fstate', 
              'Number of Records'], axis=1)

Narrow down the MSAs by selecting the one with the most value counts and the ones contributing most to the prediction.               
    

In [43]:
df4['MSA'].value_counts()

New York-New Jersey-Long Island, NY-NJ          5896
Los Angeles-Long Beach, CA                      5403
Chicago-Naperville-Joliet, IL-IN-WI             3288
Detroit-Warren-Livonia, MI                      2398
Houston-Sugar Land-Baytown, TX                  2142
Philadelphia-Camden-Wilmington, PA-NJ-DE        2117
Dallas-Fort Worth-Arlington, TX                 1894
Miami-Fort Lauderdale, FL                       1750
Washington-Arlington-Alexandria, DC-VA-MD-WV    1465
San Francisco-Oakland-Fremont, CA               1387
Atlanta-Sandy Springs-Marietta, GA              1387
Baltimore-Towson, MD                            1280
New Orleans-Metairie-Kenner, LA                 1123
St. Louis, MO-IL                                1089
Phoenix-Mesa-Scottsdale, AZ                     1003
Riverside-San Bernardino, CA                     979
Rural North Carolina                             726
Rural Texas                                      708
Memphis, TN-MS-AR                             

In [44]:
df4= df[['MSA', 'Solved']]
df4 = df4.dropna()
msa=patsy.dmatrix('MSA',data=df4,return_type='dataframe')
allt = df4.join(msa)
allt= allt.dropna()

In [45]:
X = allt.iloc[:,3:]
y = allt['Solved']

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
randomforest = RandomForestClassifier()
randomforest.fit(X_train, y_train)

y_pred_train = randomforest.predict(X_train)
y_pred_test = randomforest.predict(X_test)

table = list(zip(X,randomforest.feature_importances_))
dftable = pd.DataFrame(table, columns=['msa','Coef'])
dftable.sort_values(by='Coef', ascending=False)

Unnamed: 0,msa,Coef
231,"MSA[T.New York-New Jersey-Long Island, NY-NJ]",0.160219
196,"MSA[T.Los Angeles-Long Beach, CA]",0.097195
390,"MSA[T.Washington-Arlington-Alexandria, DC-VA-M...",0.069587
62,"MSA[T.Chicago-Naperville-Joliet, IL-IN-WI]",0.051981
364,"MSA[T.St. Louis, MO-IL]",0.033412
23,"MSA[T.Baltimore-Towson, MD]",0.031381
337,"MSA[T.San Francisco-Oakland-Fremont, CA]",0.023682
230,"MSA[T.New Orleans-Metairie-Kenner, LA]",0.023409
207,"MSA[T.Memphis, TN-MS-AR]",0.017181
90,"MSA[T.Detroit-Warren-Livonia, MI]",0.013845


Rename columns to remove spaces     
Assign binary categories to 0, 1     
Select important cities and make into dummy variables. Group all others and set to "other"     
Make month, race, and weapon into dummy categories.    
Assign other categories that are heavily weighted to one value to 1, the others to 0    

In [14]:
df.columns= [
'Agentype','MSA', 'Month', 'Solved', 
'Vic_Age', 'Vic_Count' ,'Vic_Race', 'Vic_Sex', 'Weapon', 'Year']
df['Solved'] = df['Solved'].map({'Yes': 1, 'No': 0})
df['vic_sex'] = df['Vic_Sex'].map({'Female': 1, 'Male': 0})
df['agent_type'] = np.where(df['Agentype']=='Municipal police', 1, 0)
list_of_cities4 = [
'New York-New Jersey-Long Island, NY-NJ',
'Los Angeles-Long Beach, CA',                      
'Houston-Sugar Land-Baytown, TX',                  
'Detroit-Warren-Livonia, MI',                      
'Dallas-Fort Worth-Arlington, TX',                 
'Philadelphia-Camden-Wilmington, PA-NJ-DE',        
'Washington-Arlington-Alexandria, DC-VA-MD-WV',   
'Chicago-Naperville-Joliet, IL-IN-WI',              
'San Francisco-Oakland-Fremont, CA' ,               
'Baltimore-Towson, MD' ,                           
'Miami-Fort Lauderdale, FL',                       
'Atlanta-Sandy Springs-Marietta, GA',               
'New Orleans-Metairie-Kenner, LA',                 
'Riverside-San Bernardino, CA',                  
'St. Louis, MO-IL',
'Boston-Cambridge-Quincy, MA-NH',
'Birmingham-Hoover, AL',
'Bridgeport-Stamford-Norwalk, CT',
'Memphis, TN-MS-AR']  

def city_other(city):
    if city in list_of_cities4:
        return city
    else:
        return 'Other'

df['msa']= df['MSA'].apply(city_other)

df=df.dropna()
msa=patsy.dmatrix('msa',data=df,return_type='dataframe')
month=patsy.dmatrix('Month',data=df,return_type='dataframe')
vicrace=patsy.dmatrix('Vic_Race',data=df,return_type='dataframe')
weapon=patsy.dmatrix('Weapon',data=df,return_type='dataframe')

df1 = pd.concat([df, msa], axis = 1)
df3 = pd.concat([df1, month], axis = 1)
df4 = pd.concat([df3, vicrace], axis = 1)
df5 = pd.concat([df4, weapon], axis = 1)

df5 is the final dataframe to use for modeling

### Exploratory Data Analysis

The Murder Accountability Project has a nice website with tableau dashboards  (http://murderdata.org/)
A lot of EDA was done interactively.

### Pre-Modeling

Create dataframe X with all relevant features as columns.  Original columns were not removed.        
Create target dataframe y     
Create training and testing data sets.    
Scale data    

In [17]:
X = df5[[ 
'Vic_Age', 'Vic_Count', 'Year',
'vic_sex', 'agent_type', 
'msa[T.Baltimore-Towson, MD]',
'msa[T.Birmingham-Hoover, AL]', 'msa[T.Boston-Cambridge-Quincy, MA-NH]',
'msa[T.Bridgeport-Stamford-Norwalk, CT]',
'msa[T.Chicago-Naperville-Joliet, IL-IN-WI]',
'msa[T.Dallas-Fort Worth-Arlington, TX]',
'msa[T.Detroit-Warren-Livonia, MI]',
'msa[T.Houston-Sugar Land-Baytown, TX]',
'msa[T.Los Angeles-Long Beach, CA]', 'msa[T.Memphis, TN-MS-AR]',
'msa[T.Miami-Fort Lauderdale, FL]',
'msa[T.New Orleans-Metairie-Kenner, LA]',
'msa[T.New York-New Jersey-Long Island, NY-NJ]', 'msa[T.Other]',
'msa[T.Philadelphia-Camden-Wilmington, PA-NJ-DE]',
'msa[T.Riverside-San Bernardino, CA]',
'msa[T.San Francisco-Oakland-Fremont, CA]', 'msa[T.St. Louis, MO-IL]',
'msa[T.Washington-Arlington-Alexandria, DC-VA-MD-WV]',   
'Month[T.August]', 'Month[T.December]', 'Month[T.February]',
'Month[T.January]', 'Month[T.July]', 'Month[T.June]', 'Month[T.March]',
'Month[T.May]', 'Month[T.November]', 'Month[T.October]',
'Month[T.September]', 
'Vic_Race[T.Asian or Pacific Islander]', 'Vic_Race[T.Black]',
'Vic_Race[T.White]', 
'Weapon[T.Blunt object - hammer, club, etc]', 'Weapon[T.Drowning]',
'Weapon[T.Explosives]', 'Weapon[T.Fire]',
'Weapon[T.Firearm, type not stated]',
'Weapon[T.Handgun - pistol, revolver, etc]',
'Weapon[T.Knife or cutting instrument]',
'Weapon[T.Narcotics or drugs, sleeping pills]', 'Weapon[T.Other gun]',
'Weapon[T.Other or type unknown]',
'Weapon[T.Personal weapons, includes beating]',
'Weapon[T.Poison - does not include gas]',
'Weapon[T.Pushed or thrown out window]', 'Weapon[T.Rifle]',
'Weapon[T.Shotgun]', 'Weapon[T.Strangulation - hanging]']]
        
y = df5['Solved']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)  

### Modeling
The most important metric in this project was recall.   
Training and testing accuracy were output to validate there was no overfitting.


#### knn

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

print (confusion_matrix(y_test,y_pred_test))
print(classification_report (y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

The knn algorithm was not optimized because    
1.  It takes a very long time to run    
2. There was a very low probability this would be the best algorithm.    
To optimize it, the following hyperparameters would be tested:           
k_list = [10, 50, 100]    
weight_options = ['uniform', 'distance']     

#### Logistic Regression

In [8]:
lr_model_all = LogisticRegression()
lr_model_all.fit(X_train, y_train)

y_pred_train = lr_model_all.predict(X_train)
y_pred_test = lr_model_all.predict(X_test)

print (confusion_matrix(y_test,y_pred_test))
print(classification_report (y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

[[  892  5350]
 [  702 15076]]
             precision    recall  f1-score   support

          0       0.56      0.14      0.23      6242
          1       0.74      0.96      0.83     15778

avg / total       0.69      0.73      0.66     22020

Train Accuracy: 0.7192487349163098
Test Accuracy: 0.7251589464123525


Optimized Linear Regression

In [9]:
logit = LogisticRegression()
degree = [1,2,3]
scores = ['recall']
hyperparameters = {'C':[0.01,0.1,1], 'penalty' : ['l1', 'l2']}

lr_grid2 = make_pipeline(PolynomialFeatures(degree))
lr_grid2 = GridSearchCV(logit, param_grid = hyperparameters, scoring= scores, cv=10)
lr_grid2.fit(X_train, y_train)
y_pred_train, y_pred_test = lr_grid2.predict(X_train), lr_grid2.predict(X_test)

print("Best parameters set found on development set:")
print()

print(lr_grid2.best_params_)
print()

print("Grid scores on development set:")
print()
means = lr_grid2.cv_results_['mean_test_score']
stds = lr_grid2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, lr_grid2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

print(classification_report(y_test, y_pred_test))
print (confusion_matrix(y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 0.01, 'penalty': 'l2'}

Grid scores on development set:

0.648 (+/-0.038) for {'C': 0.01, 'penalty': 'l1'}
0.651 (+/-0.030) for {'C': 0.01, 'penalty': 'l2'}
0.650 (+/-0.030) for {'C': 0.1, 'penalty': 'l1'}
0.651 (+/-0.030) for {'C': 0.1, 'penalty': 'l2'}
0.651 (+/-0.029) for {'C': 1, 'penalty': 'l1'}
0.651 (+/-0.030) for {'C': 1, 'penalty': 'l2'}

Detailed classification report:

             precision    recall  f1-score   support

          0       0.56      0.14      0.23      6242
          1       0.74      0.96      0.83     15778

avg / total       0.69      0.73      0.66     22020

[[  892  5350]
 [  702 15076]]

Train Accuracy: 0.7194433631763332
Test Accuracy: 0.7251589464123525
# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'C': 0.1, 'penalty': 'l2'}

Grid scores on development set:

0.539 (+/-0.009) for {'C': 0.01, 'penalty': 'l1'}
0.550 (+/-

#### SVC: Linear Model

In [41]:
svml = LinearSVC()
svml.fit(X_train, y_train)
y_pred_train = svml.predict(X_train) 
y_pred_test = svml.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test, y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

[[  655  5587]
 [  498 15280]]
             precision    recall  f1-score   support

          0       0.57      0.10      0.18      6242
          1       0.73      0.97      0.83     15778

avg / total       0.69      0.72      0.65     22020

Train Accuracy: 0.7164266251459712
Test Accuracy: 0.7236603088101726


Optimized Linear SVC

In [11]:
svml = LinearSVC()
param_grid = {'C': [1e-3, 1e-1,1.0]}
score = 'recall'

svmlg = GridSearchCV(svml, param_grid=param_grid, scoring = score, cv =10)  
svmlg.fit(X_train, y_train)
y_pred_train, y_pred_test = svmlg.predict(X_train), svmlg.predict(X_test)

print("Best parameters set found on development set:")
print()

print(svmlg.best_params_)
print()

print("Grid scores on development set:")
print()

means = svmlg.cv_results_['mean_test_score']
stds = svml.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, svmlg.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

print(classification_report (y_test,y_pred_test))
print (confusion_matrix(y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

Grid scores on development set:

0.717 (+/-0.008) for {'C': 0.001}
0.717 (+/-0.008) for {'C': 0.1}
0.716 (+/-0.007) for {'C': 1.0}

             precision    recall  f1-score   support

          0       0.57      0.11      0.18      6242
          1       0.73      0.97      0.83     15778

avg / total       0.69      0.72      0.65     22020

[[  669  5573]
 [  511 15267]]
Train Accuracy: 0.7170688984040483
Test Accuracy: 0.7237057220708447


#### SVC 

In [90]:
svm = SVC(kernel='rbf', probability=True)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
y_pred_train, y_pred_test = svm.predict(X_train), svm.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test, y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

[[ 1566  4771]
 [ 1116 14580]]
             precision    recall  f1-score   support

          0       0.58      0.25      0.35      6337
          1       0.75      0.93      0.83     15696

avg / total       0.70      0.73      0.69     22033

Train Accuracy: 0.744884064737006
Test Accuracy: 0.7328098760949485


The SVC algorithm was not optimized because it takes a very long time to run     
There was a very low probability this would be the best algorithm.      
To optimize it, the following hyperparameters would be tested:     
{'C': [1.0, 10., 100.], 'degree':[2,3,4],'gamma':[1e-5,1e-3,1]}

#### Random Forest
This is one of the models used so the feature importances was printed.

In [13]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test= rf.predict(X_test)

print(classification_report (y_test,y_pred_test))
print (confusion_matrix(y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

[3.24512008e-01 2.26095549e-02 2.91705283e-01 1.93740015e-02
 1.78159945e-02 3.42193492e-03 1.88691370e-03 2.64500922e-03
 1.29994876e-03 5.73965641e-03 4.74170905e-03 5.23707085e-03
 5.89917544e-03 8.19120165e-03 3.36964734e-03 4.99903792e-03
 3.56619900e-03 9.87786229e-03 2.05893627e-02 4.66280766e-03
 3.14141510e-03 4.26932288e-03 3.64632501e-03 5.59230251e-03
 1.19575099e-02 1.21044912e-02 1.15962160e-02 1.13977175e-02
 1.24769277e-02 1.13534201e-02 1.15149557e-02 1.16285455e-02
 9.54074437e-03 1.08124517e-02 1.11462628e-02 3.13559938e-03
 8.77431096e-03 7.96809146e-03 5.64500553e-03 5.77031090e-04
 5.07419578e-04 2.33457704e-03 1.38921055e-02 1.05221097e-02
 9.92158889e-03 7.89401046e-04 1.92749187e-03 9.95602650e-03
 4.72180292e-03 2.18844763e-04 8.06124927e-05 4.86355560e-03
 5.38284007e-03 4.45859809e-03]
             precision    recall  f1-score   support

          0       0.47      0.43      0.45      6242
          1       0.78      0.81      0.79     15778

avg / total   

Optimized Random Forest

In [316]:
randomforest = RandomForestClassifier()
param_grid={'n_estimators': [5, 10, 20, 50, 100], 'max_depth': [10, 20, 30, 50, 100, 200]}
scores = ['recall']

rfg3 = GridSearchCV(randomforest, param_grid=param_grid, scoring=scores, cv =10)
rfg3.fit(X_train, y_train)
y_pred_train, y_pred_test = rfg3.predict(X_train), rfg3.predict(X_test)

print("Best parameters set found on development set:")
print()
print(rfg3.best_params_)
print()
print("Grid scores on development set:")
print()
means = rfg3.cv_results_['mean_test_score']
stds = rfg3.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rfg3.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

print(classification_report (y_test,y_pred_test))
print (confusion_matrix(y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'max_depth': 10, 'n_estimators': 100}

Grid scores on development set:

0.674 (+/-0.033) for {'max_depth': 10, 'n_estimators': 5}
0.677 (+/-0.032) for {'max_depth': 10, 'n_estimators': 10}
0.682 (+/-0.033) for {'max_depth': 10, 'n_estimators': 20}
0.683 (+/-0.034) for {'max_depth': 10, 'n_estimators': 50}
0.688 (+/-0.038) for {'max_depth': 10, 'n_estimators': 100}
0.648 (+/-0.021) for {'max_depth': 20, 'n_estimators': 5}
0.665 (+/-0.024) for {'max_depth': 20, 'n_estimators': 10}
0.674 (+/-0.023) for {'max_depth': 20, 'n_estimators': 20}
0.682 (+/-0.021) for {'max_depth': 20, 'n_estimators': 50}
0.684 (+/-0.024) for {'max_depth': 20, 'n_estimators': 100}
0.627 (+/-0.014) for {'max_depth': 30, 'n_estimators': 5}
0.641 (+/-0.018) for {'max_depth': 30, 'n_estimators': 10}
0.653 (+/-0.023) for {'max_depth': 30, 'n_estimators': 20}
0.667 (+/-0.017) for {'max_depth': 30, 'n_estimators': 50}
0.667 (+/-0.02

#### Gradient Boosting

In [None]:
gbt = GradientBoostingClassifier()
gbt.fit(X_train, y_train)
y_pred_train = gbt.predict(X_train)
y_pred_test= gbt.predict(X_test)


print(classification_report (y_test,y_pred_test))
print (confusion_matrix(y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

Optimized Gradient Boosting

In [None]:
gradient = GradientBoostingClassifier()
param_grid = {'n_estimators': [800,1000]}
scores = ['recall']

gbtg2 = GridSearchCV(gradient, param_grid=param_grid, scoring = scores, cv =10)
gbtg2.fit(X_train, y_train)
y_pred_train = gbtg2.predict(X_train)
y_pred_test= gbtg2.predict(X_test)

print("Best parameters set found on development set:")
print()
print(gbtg2.best_params_)
print()
print("Grid scores on development set:")
print()
means = gbtg2.cv_results_['mean_test_score']
stds = gbtg2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gbtg2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

print(classification_report (y_test,y_pred_test))
print (confusion_matrix(y_test,y_pred_test))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

#### Naive Bayes

In [17]:
bnb = BernoulliNB() 
bnb.fit(X_train, y_train)
y_pred_train = bnb.predict(X_train)
y_pred_test= bnb.predict(X_test)

print (confusion_matrix(y_test,y_pred_test))
print(classification_report(y_test, bnb.predict(X_test)))
print('Train Accuracy:',metrics.accuracy_score(y_train, y_pred_train))
print('Test Accuracy:',metrics.accuracy_score(y_test, y_pred_test))

Accuracy: 0.710
[[ 2011  4231]
 [ 2157 13621]]
             precision    recall  f1-score   support

          0       0.48      0.32      0.39      6242
          1       0.76      0.86      0.81     15778

avg / total       0.68      0.71      0.69     22020

Train Accuracy: 0.7068314519268197
Test Accuracy: 0.7099000908265214


### Analysis

Create a baseline.  70% of the cases were solved, 30% not solved.  By always guessing they were solved,
a person would be correct 70% of the time.

In [133]:
y_guess_1 = y_test*0+1

   0      1
0  0   6337
1  0  15696
      0      1
0  2330   4007
1  2053  13643
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      6337
          1       0.71      1.00      0.83     15696

avg / total       0.51      0.71      0.59     22033

             precision    recall  f1-score   support

          0       0.53      0.37      0.43      6337
          1       0.77      0.87      0.82     15696

avg / total       0.70      0.72      0.71     22033

Guess Accuracy: 0.7123859665047882
Random Forest Accuracy: 0.7249580175191758


  'precision', 'predicted', average, warn_for)


AUC curves are calculated to determine which alorithm is best for both precision and recall.

y_pred is not a prediction of 0 or 1.  It is a probability.  Should have used different name.

When SVMLinear is run with predict_proba, an error of "SVM needs decision_function instead of predict_proba"
is given.  A workaround is using decision_function. SVM with the kernel = linear would have worked also
Decision function was used for svm also, but predict_proba should have been used.
  

In [33]:
y_pred_knn = knn.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_pred_knn)

y_pred_lrg2 = lrg2.predict_proba(X_test)[:, 1]
fpr_lrg2, tpr_lrg2, _ = roc_curve(y_test, y_pred_lrg2)

y_pred_svml = svml.predict_proba(X_test)
fpr_svml, tpr_svml, _ = roc_curve(y_test, y_pred_svml)

y_pred_svmlg = svmlg.decision_function(X_test)
fpr_svmlg, tpr_svmlg, _ = roc_curve(y_test, y_pred_svmlg)

y_pred_svm = svm.decision_function(X_test)
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_svm)

y_pred_rfg3 = rfg3.predict_proba(X_test)[:, 1]
fpr_rfg3, tpr_rfg3, _ = roc_curve(y_test, y_pred_rfg3)

y_pred_gbtg2 = gbtg2.predict_proba(X_test)[:, 1]
fpr_gbtg2, tpr_gbtg2, _ = roc_curve(y_test, y_pred_gbtg2)

y_pred_bnb = bnb.predict_proba(X_test)[:, 1]
fpr_bnb, tpr_bnb, _ = roc_curve(y_test, y_pred_bnb)

y_guess_1 = y_test*0+1
fpr_guess, tpr_guess, _ = roc_curve(y_test, y_guess_1)

plt.figure(1)
plt.figure(figsize=[10,6])
plt.plot([0, 1], [0, 1], 'k--')

plt.plot(fpr_knn, tpr_knn, label='KNN (unoptimized)', c='0.8')
plt.plot(fpr_lrg2, tpr_lrg2, label='Logistic Regression',c='0.8')
plt.plot(fpr_svmlg, tpr_svmlg, label='SVM Linear',c='0.8')
plt.plot(fpr_svm, tpr_svm, label='SVM (unoptimized)',c='0.8')
plt.plot(fpr_rfg3, tpr_rfg3, label='Random Forest',c='b')
plt.plot(fpr_gbtg2, tpr_gbtg2, label='Gradient Boosting',c='r')
plt.plot(fpr_bnb, tpr_bnb, label='Naive Bayes (Bernoulli)',c= '0.8')

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()



In [342]:
print ('AUC scores')
print ('KNN: {}'.format(round(roc_auc_score(y_test, y_pred_knn),3)))
print ('Linear Regression: {}'.format(round(roc_auc_score(y_test, y_pred_lrg2),3)))
print ('Linear SVC: {}'.format(round(roc_auc_score(y_test, y_pred_svmlg),3)))
print ('SVC: {}'.format(round(roc_auc_score(y_test, y_pred_svm),3)))
print ('Random Forest: {}'.format(round(roc_auc_score(y_test, y_pred_rfg3),3)))
print ('Gradient Boosting: {}'.format(round(roc_auc_score(y_test, y_pred_gbtg2),3)))
print ('Naive Bayes: {}'.format(round(roc_auc_score(y_test, y_pred_bnb),3)))

AUC scores
KNN: 0.647
Linear Regression: 0.683
Linear SVC: 0.683
SVC: 0.679
Random Forest: 0.675
Gradient Boosting: 0.721
Naive Bayes: 0.662


Get recall metric   
Need to overwrite y predictions in memory that hold probabilities   

In [326]:
y_pred_KNN= knn.predict(X_test)
y_pred_lrg2= lrg2.predict(X_test)
y_pred_svmlg= svmlg.predict(X_test)
y_pred_svm= svm.predict(X_test)
y_pred_rfg3= rfg3.predict(X_test)
y_pred_gbtg2= gbtg2.predict(X_test)
y_pred_bnb= bnb.predict(X_test)

print ('Recall scores')
print ('KNN: {}'.format(round(recall_score(y_test, y_pred_KNN,pos_label=0),3)))
print ('Linear Regression: {}'.format(round(recall_score(y_test, y_pred_lrg2,pos_label=0),3)))
print ('Linear SVC: {}'.format(round(recall_score(y_test, y_pred_svmlg,pos_label=0),3)))
print ('SVC: {}'.format(round(recall_score(y_test, y_pred_svm,pos_label=0),3)))
print ('Random Forest: {}'.format(round(recall_score(y_test, y_pred_rfg3, pos_label=0),3)))
print ('Gradient Boosting: {}'.format(round(recall_score(y_test, y_pred_gbtg2, pos_label=0),3)))
print ('Naive Bayes: {}'.format(round(recall_score(y_test, y_pred_bnb, pos_label=0),3)))
print()
print ('Baseline: {}'.format(round(recall_score(y_test, y_guess_1, pos_label=0),3)))


In [None]:
table = list(zip(X,rfg3.best_estimator_.feature_importances_))
cols = X.columns
dftable = pd.DataFrame(table, columns=['cols','Coef'])
dftable.sort_values(by='Coef', ascending=False)

### Test Case

Adjust the values of one homicide case for a test case

In [None]:
cols = X.columns
vals = X.iloc[0]
table = list(zip(cols,vals))
dftable = pd.DataFrame(table, columns=['Features','vals'])

Need a list of column names with their index to identify which values to change

In [274]:
for ele in enumerate(cols):
    print (ele)

(0, 'Vic_Age')
(1, 'Vic_Count')
(2, 'Year')
(3, 'vic_sex')
(4, 'agent_type')
(5, 'msa[T.Baltimore-Towson, MD]')
(6, 'msa[T.Birmingham-Hoover, AL]')
(7, 'msa[T.Boston-Cambridge-Quincy, MA-NH]')
(8, 'msa[T.Bridgeport-Stamford-Norwalk, CT]')
(9, 'msa[T.Chicago-Naperville-Joliet, IL-IN-WI]')
(10, 'msa[T.Dallas-Fort Worth-Arlington, TX]')
(11, 'msa[T.Detroit-Warren-Livonia, MI]')
(12, 'msa[T.Houston-Sugar Land-Baytown, TX]')
(13, 'msa[T.Los Angeles-Long Beach, CA]')
(14, 'msa[T.Memphis, TN-MS-AR]')
(15, 'msa[T.Miami-Fort Lauderdale, FL]')
(16, 'msa[T.New Orleans-Metairie-Kenner, LA]')
(17, 'msa[T.New York-New Jersey-Long Island, NY-NJ]')
(18, 'msa[T.Other]')
(19, 'msa[T.Philadelphia-Camden-Wilmington, PA-NJ-DE]')
(20, 'msa[T.Riverside-San Bernardino, CA]')
(21, 'msa[T.San Francisco-Oakland-Fremont, CA]')
(22, 'msa[T.St. Louis, MO-IL]')
(23, 'msa[T.Washington-Arlington-Alexandria, DC-VA-MD-WV]')
(24, 'Month[T.August]')
(25, 'Month[T.December]')
(26, 'Month[T.February]')
(27, 'Month[T.January

Test case is a 44 year old woman killed in Seattle in August, the Seattle police investigated, by poisoning

In [372]:
laura[0] = 44
laura[2] = 2018
laura[3] = 1
laura[18] = 1
laura[23] = 0
laura[24] = 1
laura[34] = 0
laura[36] = 0
laura[37] = 1
laura[42] = 0
laura[47] = 0
laura[49] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [373]:
table = list(zip(cols,laura))
dftable1 = pd.DataFrame(table)

In [374]:
laura_vals= dftable1
trans = laura_vals.T
laura = trans.drop([0, 0])
laura


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,...,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
1,44,0,2018,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


rfg3.predict_proba(laura)