## Imports

In [1]:
# %matplotlib inline
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import seaborn as sns
from subprocess import call
from collections import defaultdict


## Constants

In [7]:
TRAINING_DATA = '../datasets/fatalFinal.csv'
d = defaultdict(LabelEncoder)

## Wrangle Data

In [17]:
datapoints = pd.read_csv(TRAINING_DATA)
datapoints = datapoints.drop('Unnamed: 0', axis=1)\
                       .drop('name', axis=1)\
                       .drop('date', axis=1)\
                       .drop('race', axis=1)\
                       .drop('id', axis=1)\
                       .drop('age', axis=1)\
                       .dropna()
input_features = datapoints.drop('ageRange', axis=1)
expected = datapoints['ageRange']


In [18]:
input_features.head()


Unnamed: 0,manner_of_death,armed,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,shot,gun,M,Shelton,WA,True,attack,Not fleeing,False,7073146,0
1,shot,gun,M,Aloha,OR,False,attack,Not fleeing,False,3982267,1
2,shot and Tasered,unarmed,M,Wichita,KS,False,other,Not fleeing,False,2898292,2
3,shot,toy weapon,M,San Francisco,CA,True,attack,Not fleeing,False,38654206,1
4,shot,nail gun,M,Evans,CO,False,attack,Not fleeing,False,5359295,2


In [19]:
expected.head()

0    50-60
1    40-50
2    20-30
3    30-40
4    30-40
Name: ageRange, dtype: object

Encode Data:

In [20]:
# input_features_encoded = input_features.apply(LabelEncoder().fit_transform)
input_features_encoded = input_features.apply(lambda x: d[x.name].fit_transform(x))

input_features_encoded.head()

Unnamed: 0,manner_of_death,armed,gender,city,state,signs_of_mental_illness,threat_level,flee,body_camera,state_pop,Ethnicity
0,0,28,1,1440,47,1,0,2,0,38,0
1,0,28,1,24,37,0,0,2,0,24,1
2,1,72,1,1699,16,0,1,2,0,17,2
3,0,71,1,1389,4,1,0,2,0,50,1
4,0,50,1,474,5,0,0,2,0,29,2


## Create/Train Classifier

Create training sets:

In [21]:
X_train, X_test, y_train, y_test = train_test_split(input_features_encoded, expected, random_state=1)

Create model and train:

In [22]:
model = tree.DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## Visualize Results

In [23]:
y_predict = model.predict(X_test)
"Accuracy Score: " + str(accuracy_score(y_test.as_matrix(), y_predict))


'Accuracy Score: 0.335766423358'

Confusion Matrix:

In [24]:
pd.DataFrame(
        confusion_matrix(y_test, y_predict),
        columns=['Predicted 1', 'Predicted 2', 'Predicted 3',\
                 'Predicted 4', 'Predicted 5', 'Predicted 6'],
        index=['Actual 1', 'Actual 2', 'Actual 3',\
               'Actual 4', 'Actual 5', 'Actual 6']
    )

Unnamed: 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4,Predicted 5,Predicted 6
Actual 1,0,25,21,0,0,0
Actual 2,0,82,139,0,0,0
Actual 3,0,76,194,0,0,0
Actual 4,0,36,94,0,0,0
Actual 5,0,13,94,0,0,0
Actual 6,0,3,45,0,0,0


Generate tree map:

In [25]:
tree.export_graphviz(model, out_file='age_classification.dot', feature_names=input_features.columns)
call(['dot', '-T', 'png', './age_classification.dot', '-o', './age_classification.png'])

0

Translate Split 'state_pop' Nodes of Interest:

In [26]:
le = LabelEncoder()
le.fit(input_features['Ethnicity'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_name_mapping

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}

In [None]:
## Profiling