## Imports

In [225]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import seaborn as sns
from subprocess import call
from collections import defaultdict

## Constants

In [228]:
TRAINING_DATA = '../datasets/fatal.csv'
d = defaultdict(LabelEncoder)

## Wrangle Data

In [205]:
datapoints = pd.read_csv(TRAINING_DATA)
datapoints = datapoints.drop('Unnamed: 0', axis=1)\
                       .drop('Ethnicity', axis=1)\
                       .drop('name', axis=1)\
                       .drop('date', axis=1)\
                       .dropna()
input_features = datapoints.drop('race', axis=1)
expected = datapoints['race']


In [206]:
input_features.head()


Unnamed: 0,id,manner_of_death,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop
0,3,shot,gun,53.0,M,Shelton,WA,True,attack,Not fleeing,False,7073146
1,4,shot,gun,47.0,M,Aloha,OR,False,attack,Not fleeing,False,3982267
2,5,shot and Tasered,unarmed,23.0,M,Wichita,KS,False,other,Not fleeing,False,2898292
3,8,shot,toy weapon,32.0,M,San Francisco,CA,True,attack,Not fleeing,False,38654206
4,9,shot,nail gun,39.0,M,Evans,CO,False,attack,Not fleeing,False,5359295


In [207]:
expected.head()

0    A
1    W
2    H
3    W
4    H
Name: race, dtype: object

Encode Data:

In [229]:
# input_features_encoded = input_features.apply(LabelEncoder().fit_transform)
input_features_encoded = input_features.apply(lambda x: d[x.name].fit_transform(x))

input_features_encoded.head()

Unnamed: 0,id,manner_of_death,armed,age,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop
0,0,0,28,43,1,1341,47,1,0,2,0,38
1,1,0,28,37,1,22,37,0,0,2,0,24
2,2,1,71,12,1,1584,16,0,1,2,0,17
3,3,0,70,21,1,1293,4,1,0,2,0,50
4,4,0,50,29,1,438,5,0,0,2,0,29


## Create/Train Classifier

Create training sets:

In [209]:
X_train, X_test, y_train, y_test = train_test_split(input_features_encoded, expected, random_state=1)

Create model and train:

In [242]:
model = tree.DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Visualize Results

In [243]:
y_predict = model.predict(X_test)
"Accuracy Score: " + str(accuracy_score(y_test.as_matrix(), y_predict))


'Accuracy Score: 0.525132275132'

Confusion Matrix:

In [244]:
pd.DataFrame(
        confusion_matrix(y_test, y_predict),
        columns=['Predicted 1', 'Predicted 2', 'Predicted 3',\
                 'Predicted 4', 'Predicted 5', 'Predicted 6'],
        index=['Actual 1', 'Actual 2', 'Actual 3',\
               'Actual 4', 'Actual 5', 'Actual 6']
    )

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6
Actual 1,0,1,4,0,0,6
Actual 2,0,32,39,0,0,129
Actual 3,0,4,53,0,0,86
Actual 4,0,0,0,0,0,11
Actual 5,0,2,3,0,0,6
Actual 6,0,26,42,0,0,312


Generate tree map:

In [245]:
tree.export_graphviz(model, out_file='race_classification.dot', feature_names=input_features.columns)
call(['dot', '-T', 'png', './race_classification.dot', '-o', './race_classification.png'])

0

Translate Split 'state_pop' Nodes of Interest:

In [246]:
le = LabelEncoder()
le.fit(input_features['state_pop'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_name_mapping

{583029: 0,
 626249: 1,
 659009: 2,
 736162: 3,
 736855: 4,
 851058: 5,
 934695: 6,
 1023391: 7,
 1054491: 8,
 1327503: 9,
 1329923: 10,
 1413673: 11,
 1635483: 12,
 1846092: 13,
 1881259: 14,
 2082669: 15,
 2839172: 16,
 2898292: 17,
 2948427: 18,
 2968472: 19,
 2989192: 20,
 3106589: 21,
 3588570: 22,
 3875589: 23,
 3982267: 24,
 4411989: 25,
 4645670: 26,
 4834605: 27,
 4841164: 28,
 5359295: 29,
 5450868: 30,
 5754798: 31,
 5959902: 32,
 6059651: 33,
 6548009: 34,
 6589578: 35,
 6728577: 36,
 6742143: 37,
 7073146: 38,
 8310301: 39,
 8915456: 40,
 9909600: 41,
 9940828: 42,
 10099320: 43,
 11586941: 44,
 12783977: 45,
 12851684: 46,
 19697457: 47,
 19934451: 48,
 26956435: 49,
 38654206: 50}