## Imports

In [2]:
# %matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import seaborn as sns
from subprocess import call
from collections import defaultdict
import pandas_profiling

## Constants

In [3]:
TRAINING_DATA = '../datasets/fatal.csv'
d = defaultdict(LabelEncoder)

## Wrangle Data

In [4]:
datapoints = pd.read_csv(TRAINING_DATA)
profiling_datapoints = datapoints
datapoints = datapoints.drop('Unnamed: 0', axis=1)\
                       .drop('name', axis=1)\
                       .drop('date', axis=1)\
                       .drop('race', axis=1)\
                       .drop('id', axis=1)\
                       .dropna()
input_features = datapoints.drop('manner_of_death', axis=1)
expected = datapoints['manner_of_death']


In [5]:
input_features.head()


Unnamed: 0,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,gun,53.0,M,Shelton,WA,True,attack,Not fleeing,False,7073146,0
1,gun,47.0,M,Aloha,OR,False,attack,Not fleeing,False,3982267,1
2,unarmed,23.0,M,Wichita,KS,False,other,Not fleeing,False,2898292,2
3,toy weapon,32.0,M,San Francisco,CA,True,attack,Not fleeing,False,38654206,1
4,nail gun,39.0,M,Evans,CO,False,attack,Not fleeing,False,5359295,2


In [6]:
expected.head()

0                shot
1                shot
2    shot and Tasered
3                shot
4                shot
Name: manner_of_death, dtype: object

Encode Data:

In [7]:
# input_features_encoded = input_features.apply(LabelEncoder().fit_transform)
input_features_encoded = input_features.apply(lambda x: d[x.name].fit_transform(x))

input_features_encoded.head()

Unnamed: 0,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,28,43,1,1440,47,1,0,2,0,38,0
1,28,37,1,24,37,0,0,2,0,24,1
2,72,12,1,1699,16,0,1,2,0,17,2
3,71,21,1,1389,4,1,0,2,0,50,1
4,50,29,1,474,5,0,0,2,0,29,2


## Create/Train Classifier

Create training sets:

In [8]:
X_train, X_test, y_train, y_test = train_test_split(input_features_encoded, expected, random_state=1)

Create model and train:

In [9]:
model = tree.DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Visualize Results

In [10]:
y_predict = model.predict(X_test)
"Accuracy Score: " + str(accuracy_score(y_test.as_matrix(), y_predict))


'Accuracy Score: 0.948905109489'

Confusion Matrix:

In [11]:
pd.DataFrame(
        confusion_matrix(y_test, y_predict),
        columns=['Predicted 1', 'Predicted 2'],
        index=['Actual 1', 'Actual 2']
    )

Unnamed: 0,Predicted 1,Predicted 2
Actual 1,780,0
Actual 2,42,0


Generate tree map:

In [12]:
tree.export_graphviz(model, out_file='manner_of_death_classification.dot', feature_names=input_features.columns)
call(['dot', '-T', 'png', './manner_of_death_classification.dot', '-o', './manner_of_death_classification.png'])

0

Translate Split 'state_pop' Nodes of Interest:

In [13]:
le = LabelEncoder()
le.fit(input_features['armed'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_name_mapping

{'BB gun': 0,
 'Taser': 1,
 'air conditioner': 2,
 'ax': 3,
 'baseball bat': 4,
 'baseball bat and bottle': 5,
 'baseball bat and fireplace poker': 6,
 'baton': 7,
 'bayonet': 8,
 'bean-bag gun': 9,
 'beer bottle': 10,
 'blunt object': 11,
 'box cutter': 12,
 'brick': 13,
 'carjack': 14,
 'chain': 15,
 'chain saw': 16,
 'chainsaw': 17,
 'chair': 18,
 "contractor's level": 19,
 'cordless drill': 20,
 'crossbow': 21,
 'crowbar': 22,
 'fireworks': 23,
 'flagpole': 24,
 'flashlight': 25,
 'garden tool': 26,
 'glass shard': 27,
 'gun': 28,
 'gun and car': 29,
 'gun and knife': 30,
 'gun and sword': 31,
 'guns and explosives': 32,
 'hammer': 33,
 'hand torch': 34,
 'hatchet': 35,
 'hatchet and gun': 36,
 'incendiary device': 37,
 'knife': 38,
 'lawn mower blade': 39,
 'machete': 40,
 'machete and gun': 41,
 'meat cleaver': 42,
 'metal hand tool': 43,
 'metal object': 44,
 'metal pipe': 45,
 'metal pole': 46,
 'metal rake': 47,
 'metal stick': 48,
 'motorcycle': 49,
 'nail gun': 50,
 'oar': 5

# Profiling

In [14]:
pandas_profiling.ProfileReport(datapoints)

0,1
Number of variables,13
Number of observations,3285
Total Missing (%),0.0%
Total size in memory,288.8 KiB
Average record size in memory,90.0 B

0,1
Numeric,4
Categorical,7
Boolean,2
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,6
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.6362
Minimum,0
Maximum,5
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,1
Q3,3
95-th percentile,3
Maximum,5
Range,5
Interquartile range,2

0,1
Standard deviation,1.0686
Coef of variation,0.65311
Kurtosis,-0.022732
Mean,1.6362
MAD,0.91823
Skewness,0.64613
Sum,5375
Variance,1.142
Memory size,25.7 KiB

Value,Count,Frequency (%),Unnamed: 3
1,1563,0.0%,
3,779,0.0%,
2,546,0.0%,
0,314,0.0%,
5,51,0.0%,
4,32,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,314,0.0%,
1,1563,0.0%,
2,546,0.0%,
3,779,0.0%,
4,32,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1563,0.0%,
2,546,0.0%,
3,779,0.0%,
4,32,0.0%,
5,51,0.0%,

0,1
Distinct count,77
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,36.784
Minimum,6
Maximum,91
Zeros (%),0.0%

0,1
Minimum,6
5-th percentile,19
Q1,27
Median,35
Q3,45
95-th percentile,60
Maximum,91
Range,85
Interquartile range,18

0,1
Standard deviation,12.801
Coef of variation,0.34801
Kurtosis,0.21397
Mean,36.784
MAD,10.214
Skewness,0.72535
Sum,120840
Variance,163.87
Memory size,25.7 KiB

Value,Count,Frequency (%),Unnamed: 3
25.0,130,0.0%,
31.0,113,0.0%,
32.0,108,0.0%,
36.0,107,0.0%,
24.0,104,0.0%,
36.8331303289,104,0.0%,
27.0,104,0.0%,
33.0,101,0.0%,
35.0,100,0.0%,
29.0,100,0.0%,

Value,Count,Frequency (%),Unnamed: 3
6.0,2,0.0%,
12.0,1,0.0%,
13.0,1,0.0%,
14.0,2,0.0%,
15.0,11,0.0%,

Value,Count,Frequency (%),Unnamed: 3
82.0,1,0.0%,
83.0,2,0.0%,
84.0,1,0.0%,
86.0,2,0.0%,
91.0,1,0.0%,

0,1
Distinct count,76
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
gun,1843
knife,480
unarmed,232
Other values (73),730

Value,Count,Frequency (%),Unnamed: 3
gun,1843,0.0%,
knife,480,0.0%,
unarmed,232,0.0%,
vehicle,230,0.0%,
toy weapon,124,0.0%,
undetermined,113,0.0%,
unknown weapon,31,0.0%,
machete,30,0.0%,
Taser,13,0.0%,
ax,12,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.10594

0,1
False,2937
True,348

Value,Count,Frequency (%),Unnamed: 3
False,2937,0.0%,
True,348,0.0%,

0,1
Distinct count,1744
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Los Angeles,51
Phoenix,45
Houston,33
Other values (1741),3156

Value,Count,Frequency (%),Unnamed: 3
Los Angeles,51,0.0%,
Phoenix,45,0.0%,
Houston,33,0.0%,
Columbus,27,0.0%,
Las Vegas,26,0.0%,
Chicago,25,0.0%,
San Antonio,24,0.0%,
Austin,22,0.0%,
Albuquerque,20,0.0%,
Oklahoma City,19,0.0%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Not fleeing,2201
Car,541
Foot,424

Value,Count,Frequency (%),Unnamed: 3
Not fleeing,2201,0.0%,
Car,541,0.0%,
Foot,424,0.0%,
Other,119,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
M,3142
F,143

Value,Count,Frequency (%),Unnamed: 3
M,3142,0.0%,
F,143,0.0%,

0,1
Distinct count,3285
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1674.2
Minimum,0
Maximum,3398
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,164.2
Q1,822.0
Median,1660.0
Q3,2522.0
95-th percentile,3220.8
Maximum,3398.0
Range,3398.0
Interquartile range,1700.0

0,1
Standard deviation,979.86
Coef of variation,0.58526
Kurtosis,-1.1949
Mean,1674.2
MAD,847.73
Skewness,0.030801
Sum,5499870
Variance,960130
Memory size,25.7 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
557,1,0.0%,
577,1,0.0%,
2624,1,0.0%,
573,1,0.0%,
2620,1,0.0%,
569,1,0.0%,
2616,1,0.0%,
565,1,0.0%,
2612,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
3394,1,0.0%,
3395,1,0.0%,
3396,1,0.0%,
3397,1,0.0%,
3398,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
shot,3092
shot and Tasered,193

Value,Count,Frequency (%),Unnamed: 3
shot,3092,0.0%,
shot and Tasered,193,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.24414

0,1
False,2483
True,802

Value,Count,Frequency (%),Unnamed: 3
False,2483,0.0%,
True,802,0.0%,

0,1
Distinct count,51
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
CA,525
TX,289
FL,199
Other values (48),2272

Value,Count,Frequency (%),Unnamed: 3
CA,525,0.0%,
TX,289,0.0%,
FL,199,0.0%,
AZ,161,0.0%,
CO,103,0.0%,
OK,102,0.0%,
GA,101,0.0%,
OH,99,0.0%,
NC,91,0.0%,
MO,82,0.0%,

0,1
Distinct count,51
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,14275000
Minimum,583029
Maximum,38654206
Zeros (%),0.0%

0,1
Minimum,583029
5-th percentile,1635500
Q1,4834600
Median,8310300
Q3,19934000
95-th percentile,38654000
Maximum,38654206
Range,38071177
Interquartile range,15100000

0,1
Standard deviation,12763000
Coef of variation,0.89409
Kurtosis,-0.57134
Mean,14275000
MAD,10898000
Skewness,0.95686
Sum,46893383984
Variance,1.629e+14
Memory size,25.7 KiB

Value,Count,Frequency (%),Unnamed: 3
38654206,525,0.0%,
26956435,289,0.0%,
19934451,199,0.0%,
6728577,161,0.0%,
5359295,103,0.0%,
3875589,102,0.0%,
10099320,101,0.0%,
11586941,99,0.0%,
9940828,91,0.0%,
6059651,82,0.0%,

Value,Count,Frequency (%),Unnamed: 3
583029,10,0.0%,
626249,6,0.0%,
659009,11,0.0%,
736162,7,0.0%,
736855,22,0.0%,

Value,Count,Frequency (%),Unnamed: 3
12851684,73,0.0%,
19697457,57,0.0%,
19934451,199,0.0%,
26956435,289,0.0%,
38654206,525,0.0%,

0,1
Distinct count,3
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
attack,2089
other,1058
undetermined,138

Value,Count,Frequency (%),Unnamed: 3
attack,2089,0.0%,
other,1058,0.0%,
undetermined,138,0.0%,

Unnamed: 0,manner_of_death,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,shot,gun,53.0,M,Shelton,WA,True,attack,Not fleeing,False,7073146,0
1,shot,gun,47.0,M,Aloha,OR,False,attack,Not fleeing,False,3982267,1
2,shot and Tasered,unarmed,23.0,M,Wichita,KS,False,other,Not fleeing,False,2898292,2
3,shot,toy weapon,32.0,M,San Francisco,CA,True,attack,Not fleeing,False,38654206,1
4,shot,nail gun,39.0,M,Evans,CO,False,attack,Not fleeing,False,5359295,2
