## Imports

In [2]:
# %matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import seaborn as sns
from subprocess import call
from collections import defaultdict
import pandas_profiling

## Constants

In [3]:
TRAINING_DATA = '../datasets/fatal.csv'
d = defaultdict(LabelEncoder)

## Wrangle Data

In [15]:
datapoints = pd.read_csv(TRAINING_DATA)
profiling_datapoints = datapoints
datapoints = datapoints.drop('Unnamed: 0', axis=1)\
                       .drop('name', axis=1)\
                       .drop('date', axis=1)\
                       .drop('race', axis=1)\
                       .drop('id', axis=1)\
                       .dropna()
input_features = datapoints.drop('manner_of_death', axis=1)
expected = datapoints['manner_of_death']


In [5]:
input_features.head()


Unnamed: 0,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,gun,53.0,M,Shelton,WA,True,attack,Not fleeing,False,7073146,0
1,gun,47.0,M,Aloha,OR,False,attack,Not fleeing,False,3982267,1
2,unarmed,23.0,M,Wichita,KS,False,other,Not fleeing,False,2898292,2
3,toy weapon,32.0,M,San Francisco,CA,True,attack,Not fleeing,False,38654206,1
4,nail gun,39.0,M,Evans,CO,False,attack,Not fleeing,False,5359295,2


In [6]:
expected.head()

0                shot
1                shot
2    shot and Tasered
3                shot
4                shot
Name: manner_of_death, dtype: object

Encode Data:

In [7]:
# input_features_encoded = input_features.apply(LabelEncoder().fit_transform)
input_features_encoded = input_features.apply(lambda x: d[x.name].fit_transform(x))

input_features_encoded.head()

Unnamed: 0,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,28,43,1,1440,47,1,0,2,0,38,0
1,28,37,1,24,37,0,0,2,0,24,1
2,72,12,1,1699,16,0,1,2,0,17,2
3,71,21,1,1389,4,1,0,2,0,50,1
4,50,29,1,474,5,0,0,2,0,29,2


## Create/Train Classifier

Create training sets:

In [8]:
X_train, X_test, y_train, y_test = train_test_split(input_features_encoded, expected, random_state=1)

Create model and train:

In [9]:
model = tree.DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Visualize Results

In [10]:
y_predict = model.predict(X_test)
"Accuracy Score: " + str(accuracy_score(y_test.as_matrix(), y_predict))


'Accuracy Score: 0.948905109489'

Confusion Matrix:

In [11]:
pd.DataFrame(
        confusion_matrix(y_test, y_predict),
        columns=['Predicted 1', 'Predicted 2'],
        index=['Actual 1', 'Actual 2']
    )

Unnamed: 0,Predicted 1,Predicted 2
Actual 1,780,0
Actual 2,42,0


Generate tree map:

In [12]:
tree.export_graphviz(model, out_file='manner_of_death_classification.dot', feature_names=input_features.columns)
call(['dot', '-T', 'png', './manner_of_death_classification.dot', '-o', './manner_of_death_classification.png'])

0

Translate Split 'state_pop' Nodes of Interest:

In [13]:
le = LabelEncoder()
le.fit(input_features['armed'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_name_mapping

{'BB gun': 0,
 'Taser': 1,
 'air conditioner': 2,
 'ax': 3,
 'baseball bat': 4,
 'baseball bat and bottle': 5,
 'baseball bat and fireplace poker': 6,
 'baton': 7,
 'bayonet': 8,
 'bean-bag gun': 9,
 'beer bottle': 10,
 'blunt object': 11,
 'box cutter': 12,
 'brick': 13,
 'carjack': 14,
 'chain': 15,
 'chain saw': 16,
 'chainsaw': 17,
 'chair': 18,
 "contractor's level": 19,
 'cordless drill': 20,
 'crossbow': 21,
 'crowbar': 22,
 'fireworks': 23,
 'flagpole': 24,
 'flashlight': 25,
 'garden tool': 26,
 'glass shard': 27,
 'gun': 28,
 'gun and car': 29,
 'gun and knife': 30,
 'gun and sword': 31,
 'guns and explosives': 32,
 'hammer': 33,
 'hand torch': 34,
 'hatchet': 35,
 'hatchet and gun': 36,
 'incendiary device': 37,
 'knife': 38,
 'lawn mower blade': 39,
 'machete': 40,
 'machete and gun': 41,
 'meat cleaver': 42,
 'metal hand tool': 43,
 'metal object': 44,
 'metal pipe': 45,
 'metal pole': 46,
 'metal rake': 47,
 'metal stick': 48,
 'motorcycle': 49,
 'nail gun': 50,
 'oar': 5

# Profiling

In [16]:
pandas_profiling.ProfileReport(profiling_datapoints)

0,1
Number of variables,17
Number of observations,3399
Total Missing (%),0.0%
Total size in memory,405.0 KiB
Average record size in memory,122.0 B

0,1
Numeric,4
Categorical,10
Boolean,2
Date,0
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.629
Minimum,0
Maximum,5
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,3
95-th percentile,3
Maximum,5
Range,5
Interquartile range,2

0,1
Standard deviation,1.0745
Coef of variation,0.65961
Kurtosis,-0.018772
Mean,1.629
MAD,0.92222
Skewness,0.64443
Sum,5537
Variance,1.1546
Memory size,26.6 KiB

Value,Count,Frequency (%),Unnamed: 3
1,1606,0.0%,
3,803,0.0%,
2,562,0.0%,
0,342,0.0%,
5,54,0.0%,
4,32,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,342,0.0%,
1,1606,0.0%,
2,562,0.0%,
3,803,0.0%,
4,32,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1606,0.0%,
2,562,0.0%,
3,803,0.0%,
4,32,0.0%,
5,54,0.0%,

0,1
Distinct count,3399
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1699
Minimum,0
Maximum,3398
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,169.9
Q1,849.5
Median,1699.0
Q3,2548.5
95-th percentile,3228.1
Maximum,3398.0
Range,3398.0
Interquartile range,1699.0

0,1
Standard deviation,981.35
Coef of variation,0.57761
Kurtosis,-1.2
Mean,1699
MAD,849.75
Skewness,0
Sum,5774901
Variance,963050
Memory size,26.6 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
2628,1,0.0%,
2624,1,0.0%,
573,1,0.0%,
2620,1,0.0%,
569,1,0.0%,
2616,1,0.0%,
565,1,0.0%,
2612,1,0.0%,
561,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
3394,1,0.0%,
3395,1,0.0%,
3396,1,0.0%,
3397,1,0.0%,
3398,1,0.0%,

0,1
Distinct count,77
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,36.833
Minimum,6
Maximum,91
Zeros (%),0.0%

0,1
Minimum,6
5-th percentile,19
Q1,27
Median,35
Q3,45
95-th percentile,60
Maximum,91
Range,85
Interquartile range,18

0,1
Standard deviation,12.809
Coef of variation,0.34775
Kurtosis,0.21757
Mean,36.833
MAD,10.214
Skewness,0.72123
Sum,125200
Variance,164.07
Memory size,26.6 KiB

Value,Count,Frequency (%),Unnamed: 3
25.0,134,0.0%,
36.8331303289,115,0.0%,
31.0,114,0.0%,
36.0,110,0.0%,
24.0,110,0.0%,
32.0,108,0.0%,
27.0,108,0.0%,
33.0,104,0.0%,
29.0,103,0.0%,
35.0,101,0.0%,

Value,Count,Frequency (%),Unnamed: 3
6.0,2,0.0%,
12.0,1,0.0%,
13.0,1,0.0%,
14.0,2,0.0%,
15.0,11,0.0%,

Value,Count,Frequency (%),Unnamed: 3
82.0,2,0.0%,
83.0,2,0.0%,
84.0,1,0.0%,
86.0,2,0.0%,
91.0,1,0.0%,

0,1
Distinct count,78
Unique (%),0.0%
Missing (%),100.0%
Missing (n),6

0,1
gun,1893
knife,495
vehicle,239
Other values (74),766

Value,Count,Frequency (%),Unnamed: 3
gun,1893,0.0%,
knife,495,0.0%,
vehicle,239,0.0%,
unarmed,236,0.0%,
undetermined,133,0.0%,
toy weapon,124,0.0%,
unknown weapon,37,0.0%,
machete,31,0.0%,
Taser,13,0.0%,
ax,13,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.10503

0,1
False,3042
True,357

Value,Count,Frequency (%),Unnamed: 3
False,3042,0.0%,
True,357,0.0%,

0,1
Distinct count,1789
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Los Angeles,52
Phoenix,45
Houston,34
Other values (1786),3268

Value,Count,Frequency (%),Unnamed: 3
Los Angeles,52,0.0%,
Phoenix,45,0.0%,
Houston,34,0.0%,
Columbus,29,0.0%,
Chicago,28,0.0%,
Las Vegas,28,0.0%,
San Antonio,25,0.0%,
Austin,22,0.0%,
Miami,21,0.0%,
Albuquerque,20,0.0%,

0,1
Distinct count,1172
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
2018-04-01,9
2016-12-21,8
2018-01-06,8
Other values (1169),3374

Value,Count,Frequency (%),Unnamed: 3
2018-04-01,9,0.0%,
2016-12-21,8,0.0%,
2018-01-06,8,0.0%,
2017-02-10,8,0.0%,
2017-07-04,8,0.0%,
2017-12-26,8,0.0%,
2016-01-27,8,0.0%,
2018-03-23,8,0.0%,
2015-12-14,8,0.0%,
2015-07-07,8,0.0%,

0,1
Distinct count,5
Unique (%),0.0%
Missing (%),100.0%
Missing (n),106

0,1
Not fleeing,2207
Car,543
Foot,424

Value,Count,Frequency (%),Unnamed: 3
Not fleeing,2207,0.0%,
Car,543,0.0%,
Foot,424,0.0%,
Other,119,0.0%,
(Missing),106,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),100.0%
Missing (n),5

0,1
M,3247
F,147
(Missing),5

Value,Count,Frequency (%),Unnamed: 3
M,3247,0.0%,
F,147,0.0%,
(Missing),5,0.0%,

0,1
Correlation,0.99724

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
shot,3201
shot and Tasered,198

Value,Count,Frequency (%),Unnamed: 3
shot,3201,0.0%,
shot and Tasered,198,0.0%,

0,1
Distinct count,3300
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
TK TK,89
TK Tk,2
Eric Harris,2
Other values (3297),3306

Value,Count,Frequency (%),Unnamed: 3
TK TK,89,0.0%,
TK Tk,2,0.0%,
Eric Harris,2,0.0%,
Daquan Antonio Westbrook,2,0.0%,
Michael Johnson,2,0.0%,
Brandon Jones,2,0.0%,
Christian Chavez,2,0.0%,
George Tillman,2,0.0%,
Michael Brown,2,0.0%,
Richard Rangel,2,0.0%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),100.0%
Missing (n),289

0,1
W,1606
B,803
H,562
Other values (3),139
(Missing),289

Value,Count,Frequency (%),Unnamed: 3
W,1606,0.0%,
B,803,0.0%,
H,562,0.0%,
N,54,0.0%,
A,53,0.0%,
O,32,0.0%,
(Missing),289,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.24036

0,1
False,2582
True,817

Value,Count,Frequency (%),Unnamed: 3
False,2582,0.0%,
True,817,0.0%,

0,1
Distinct count,51
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
CA,540
TX,302
FL,208
Other values (48),2349

Value,Count,Frequency (%),Unnamed: 3
CA,540,0.0%,
TX,302,0.0%,
FL,208,0.0%,
AZ,166,0.0%,
CO,108,0.0%,
GA,104,0.0%,
OK,104,0.0%,
OH,102,0.0%,
NC,92,0.0%,
MO,86,0.0%,

0,1
Distinct count,51
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,14247000
Minimum,583029
Maximum,38654206
Zeros (%),0.0%

0,1
Minimum,583029
5-th percentile,1635500
Q1,4834600
Median,8310300
Q3,19934000
95-th percentile,38654000
Maximum,38654206
Range,38071177
Interquartile range,15100000

0,1
Standard deviation,12757000
Coef of variation,0.89543
Kurtosis,-0.56835
Mean,14247000
MAD,10893000
Skewness,0.95739
Sum,48424796398
Variance,1.6274e+14
Memory size,26.6 KiB

Value,Count,Frequency (%),Unnamed: 3
38654206,540,0.0%,
26956435,302,0.0%,
19934451,208,0.0%,
6728577,166,0.0%,
5359295,108,0.0%,
3875589,104,0.0%,
10099320,104,0.0%,
11586941,102,0.0%,
9940828,92,0.0%,
6059651,86,0.0%,

Value,Count,Frequency (%),Unnamed: 3
583029,11,0.0%,
626249,6,0.0%,
659009,11,0.0%,
736162,7,0.0%,
736855,22,0.0%,

Value,Count,Frequency (%),Unnamed: 3
12851684,76,0.0%,
19697457,57,0.0%,
19934451,208,0.0%,
26956435,302,0.0%,
38654206,540,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
attack,2134
other,1097
undetermined,168

Value,Count,Frequency (%),Unnamed: 3
attack,2134,0.0%,
other,1097,0.0%,
undetermined,168,0.0%,

Unnamed: 0.1,Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False,7073146,0
1,1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False,3982267,1
2,2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False,2898292,2
3,3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False,38654206,1
4,4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False,5359295,2
