#  Understanding Police Fatal Shootings

A notebook for analyzing how factors in fatal police shootings relate to race.

Reference: <https://bit.ly/mui-asdrp>

Data Download and Pre-Processing from [Mapping Police Violence](https://mappingpoliceviolence.us/s/MPVDatasetDownload.xlsx)

In [None]:
#importing useful libraries
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw records from the local CSV file into a dataframe.
df = pd.read_csv(filepath_or_buffer="../data/police-violence.csv")

In [None]:
# Show the (rows, columns) dimensions of the loaded dataframe.
df.shape

In [None]:
# Preview the first two rows.
df.head(2)

In [None]:
# Keep only the columns used by the rest of the notebook.
selected_columns = [
    'name', 'age', 'gender', 'race', 'date', 'city', 'state', 'zip',
    'county', 'agency_responsible', 'cause_of_death', 'counter_type',
    'initial_reason',
]
df = df[selected_columns]

In [None]:
# Preview the last two rows of the reduced column set.
df.tail(2)

In [None]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
def one_hot_encode(data):
    """One-hot encode a single column of categorical values.

    Args:
        data: 1-D array-like of category values, one entry per sample.

    Returns:
        A tuple ``(onehot_encoded, inverted)``. ``onehot_encoded`` is a float
        array of shape ``(n_samples, n_categories)`` with exactly one 1 per
        row. ``inverted`` is a list that holds, for each one-hot column in
        order, a one-element array with the category that column stands for.
    """
    values = array(data)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    n_samples = len(integer_encoded)
    n_categories = len(label_encoder.classes_)

    # Build the indicator matrix straight from the integer codes. This avoids
    # OneHotEncoder's `sparse=` keyword, which newer scikit-learn releases no
    # longer accept, while producing the same columns (code j -> column j).
    onehot_encoded = np.zeros((n_samples, n_categories))
    onehot_encoded[np.arange(n_samples), integer_encoded] = 1.0

    # classes_[[i]] is what inverse_transform([i]) returns, without paying for
    # a full validation pass per category.
    inverted = [label_encoder.classes_[[i]] for i in range(n_categories)]
    return onehot_encoded, inverted

In [None]:
# One empty label list per dataframe row; a later cell fills them in.
outputs = [[] for _ in range(df.shape[0])]

In [None]:
# Builds the two-element label for every row:
# [1, 0] = African American killed by police, [0, 1] = non-African American killed by police.
races = df["race"]
for row_index, race in enumerate(races):
    outputs[row_index].extend([1, 0] if race == "Black" else [0, 1])
# Remove the label column so every remaining column can be encoded as an input.
df = df.drop(columns=["race"])

In [None]:
# Scalar form of the label: 1 when the race is "Black", 0 for every other value.
xOutputs = [1 if race == "Black" else 0 for race in races]

In [None]:
# Number of binary race labels built in the previous cell.
len(xOutputs)

In [None]:
# creates inputs by one-hot encoding individual dataframe columns
# inverted is a list of categorical data which corresponds to each "1" in the one-hot encoding
inputs = []
inverted = []
for column in df:
    # Encode each column exactly once and reuse both returned values; the
    # encoder is refit on every call, so calling it twice doubles the work.
    encoded_column, column_categories = one_hot_encode(array(df[column]))
    inputs.append(encoded_column)
    inverted.extend(column_categories)
# Concatenate the per-column blocks side by side into one feature matrix.
inputs = np.hstack(inputs)


In [None]:
# code for undersampling
#import random
#temp = list(zip(inputs, outputs)) 
#random.shuffle(temp) 
#inputs, outputs = zip(*temp)

In [None]:
#count = 0
#countx = 0
#for i in range(len(outputs)):
#  if(outputs[i][0] == 1):
#    count += 1
#  if(outputs[i][0] == 0):
#    countx += 1
#check = 0
#newInputs = []
#newOutputs = []
#inputs = list(inputs)
#outputs = list(outputs)
#for i in range(len(inputs) - 1):
#  if(check < abs(count-countx)):
#    if(outputs[i][1] == 0):
#      newInputs.append(inputs[i])
#      newOutputs.append(outputs[i])
#    else:
#      check += 1
#  else:
#    newInputs.append(inputs[i])
#    newOutputs.append(outputs[i])

In [None]:
#import random
#temp = list(zip(newInputs, newOutputs)) 
#random.shuffle(temp) 
#newInputs, newOutputs = zip(*temp)

In [None]:
# Number of rows in the stacked one-hot feature matrix.
len(inputs)

In [None]:
# Number of labels, shown next to the feature-row count for comparison.
len(xOutputs)

In [None]:
# summarize class distribution
# Counter is imported here because the notebook otherwise first imports it in
# a later cell, which makes this cell fail when cells run top to bottom.
from collections import Counter

print(Counter(xOutputs))

In [None]:
import sklearn.utils._cython_blas
import imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
# Randomly over-sample the minority class of the labels (sampling_strategy='minority').
# NOTE(review): no random_state is passed, so the resampled rows differ between runs.
# NOTE(review): resampling happens before the train/test split further down, so copies
# of the same minority row can end up in both splits -- confirm the test scores are not inflated.
oversample = RandomOverSampler(sampling_strategy='minority')
newInputs, newOutputs = oversample.fit_resample(inputs, xOutputs)

# summarize class distribution
print(Counter(newOutputs))

In [None]:
import random
# Shuffle features and labels together so every row keeps its own label.
# After unpacking, newInputs and newOutputs are tuples rather than arrays.
paired_rows = list(zip(newInputs, newOutputs))
random.shuffle(paired_rows)
newInputs, newOutputs = zip(*paired_rows)

In [None]:
# creates a train/test split of 70-30: the first 70% of the shuffled rows train, the rest test
newOutputs = np.array(newOutputs)
split = int(0.7*len(newInputs))
X_train, y_train = newInputs[:split], newOutputs[:split]
X_test, y_test = newInputs[split:], newOutputs[split:]
# Append a trailing axis of length 1 to both feature arrays.
X_train, X_test = (np.expand_dims(part, axis = 2) for part in (X_train, X_test))

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LSTM, Flatten, Conv1D
#earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, restore_best_weights= True)
# Fully connected binary classifier: flatten the input, then two hidden ReLU
# layers with dropout, then a single output unit.
model = Sequential()
model.add(Flatten())
model.add(Dense(2000, activation = 'relu'))
model.add(Dropout(.3))
model.add(Dense(200, activation = 'relu'))
model.add(Dropout(.3))
# The target is a 0/1 label, so the output is a sigmoid probability trained
# with binary cross-entropy. A ReLU output unit can get stuck at 0 with no
# gradient, and it is unbounded above, so MSE on it is a poor fit for a 0/1
# target.
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = "binary_crossentropy", optimizer = "Adam", metrics=["accuracy"])

In [None]:
# Fits the model to the training data: batches of 16, 20 epochs, with 20% of the
# training rows held out for validation.
# No class weights are passed here; the class imbalance is addressed earlier in the
# notebook by random over-sampling of the minority class.
model.fit(X_train, y_train,
            batch_size=16,
            epochs= 20,
            validation_split = 0.2, 
            )

In [None]:
# Print the layer-by-layer summary of the fitted model.
model.summary()

In [None]:
from sklearn.metrics import f1_score
# Turn the network's continuous outputs into hard 0/1 labels (< 0.5 -> 0,
# otherwise 1) and flatten them to one dimension so they line up element-wise
# with y_test.
preds = np.where(model.predict(X_test) < 0.5, 0, 1).reshape(-1)
num_correct = int(np.sum(preds == y_test))
print("The fraction of correctly classified examples in the test set is: " + str(num_correct / len(preds)))
# average=None reports one F1 score per class instead of a single aggregate.
f1_score(y_test, preds, average = None)

In [None]:
# creates a train/test split of 80-20 (features are left without the extra trailing axis)
newOutputs = np.array(newOutputs)
split = int(0.8*len(newInputs))
X_train, y_train = newInputs[:split], newOutputs[:split]
X_test, y_test = newInputs[split:], newOutputs[split:]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.metrics import f1_score
# Fit a decision tree whose depth is effectively unlimited and score it on the test split.
model = tree.DecisionTreeClassifier(max_depth= 5000)
model.fit(X_train, y_train)
# Map predictions onto 0/1. The original else-branch was `preds[i] == 1`,
# a comparison with no effect, so values >= 0.5 were never actually set.
preds = np.where(model.predict(X_test) < 0.5, 0, 1)
num_correct = int(np.sum(preds == y_test))
print("The fraction of correctly classified examples in the test set is: " + str(num_correct / len(preds)))
# average=None reports one F1 score per class instead of a single aggregate.
f1_score(y_test, preds, average = None)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# Fit a 300-tree random forest and score it on the test split.
model = RandomForestClassifier(300)
model.fit(X_train, y_train)
# Map predictions onto 0/1. The original else-branch was `preds[i] == 1`,
# a comparison with no effect, so values >= 0.5 were never actually set.
preds = np.where(model.predict(X_test) < 0.5, 0, 1)
num_correct = int(np.sum(preds == y_test))
print("The fraction of correctly classified examples in the test set is: " + str(num_correct / len(preds)))
# average=None reports one F1 score per class instead of a single aggregate.
f1_score(y_test, preds, average = None)

In [None]:
! pip install graphviz

# if you are using mac:
! brew install graphviz

# if you are using Ubuntu
! sudo apt-get install graphivz

# if you are using Windows
! echo "You need to install GraphViz executable from web first"

In [None]:
import graphviz
# Fit a tree capped at depth 4; this is the model exported to Graphviz in the following cells.
model = tree.DecisionTreeClassifier(max_depth= 4)
model.fit(X_train, y_train)

In [None]:
# Export the fitted tree as DOT text (out_file=None returns it as a string).
# class_names must follow ascending class order: the labels are 0 for
# non-Black and 1 for Black, so "nonAA" comes first.
dot_data = tree.export_graphviz(model, out_file=None, 
                                feature_names=inverted,  
                                class_names= ["nonAA", "AA"],
                                filled=True)

In [None]:
# Wrap the DOT text in a graphviz Source object configured for PNG output.
graph = graphviz.Source(dot_data, format="png") 

In [None]:
import pydotplus
# Parse the same DOT text with pydotplus.
pydot_graph = pydotplus.graph_from_dot_data(dot_data)

In [None]:
from IPython.display import Image
# Render the pydotplus graph to PNG bytes and display them inline.
Image(pydot_graph.create_png())

In [None]:
# Rebuild the graphviz Source object and leave it as the cell's last expression so it is displayed.
graph = graphviz.Source(dot_data, format="png") 
graph

In [None]:
from sklearn.metrics import accuracy_score
# For every maximum depth, fit a random forest and record the best and the
# median test accuracy among its individual trees.
accuracies = []
medAccuracies = []
depths = []
n_estimators = 300

# this takes many HOURS -- be ready to get 8+ hours sleep!
for max_depth in range(5, 500, 5):
    print(max_depth)
    depths.append(max_depth)
    model = RandomForestClassifier(n_estimators, max_depth=max_depth)
    model.fit(X_train, y_train)

    # Score every tree of the forest on its own against the test split.
    estimatorAccuracy = pd.DataFrame(
        [
            [estimator_number, accuracy_score(y_test, estimator.predict(X_test))]
            for estimator_number, estimator in enumerate(model.estimators_)
        ],
        columns=['estimatorNumber', 'Accuracy'],
    )
    estimatorAccuracy = estimatorAccuracy.sort_values(by='Accuracy', ascending=False)
    estimatorAccuracy = estimatorAccuracy.reset_index(drop=True)

    # Rows are sorted best-first, so the first row is the most accurate tree.
    accuracies.append(estimatorAccuracy['Accuracy'].iloc[0])
    # Use the real median instead of reading the row at the float position
    # len/2 + 1, which was both off-centre and dependent on float indexing.
    medAccuracies.append(estimatorAccuracy['Accuracy'].median())


estimatorAccuracy.head()

In [None]:
# Plot the best-tree and median-tree accuracy against the forest's maximum depth.
depth_curves = (
    (accuracies, "Accuracy of the Most Accurate Decision Tree in the Random Forest"),
    (medAccuracies, "Median Accuracy of the Decision Trees in the Random Forest"),
)
plt.title("Various Accuracies of Decision Trees in the Random Forest Based On Depth")
for curve_values, curve_label in depth_curves:
    plt.plot(depths, curve_values, label = curve_label)

plt.xlabel("Maximum Depth")
plt.ylabel("Accuracy")
plt.legend()

In [None]:
import matplotlib.pyplot as plt # has a different graph nearly every time, but the range of accuracies is always similar, which is what the graph is used for
# Refit the same random forest 100 times and collect the test accuracy of each
# run to see how much it varies between fits.
answers = []
for _ in range(100):
    model = RandomForestClassifier(300)
    model.fit(X_train, y_train)
    # Map predictions onto 0/1. The original else-branch was `preds[i] == 1`,
    # a comparison with no effect, and the inner loops reused the outer
    # loop variable `i`.
    preds = np.where(model.predict(X_test) < 0.5, 0, 1)
    num_correct = int(np.sum(preds == y_test))
    answers.append(num_correct / len(preds))

plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})
plt.hist(answers, bins=20)
plt.gca().set(title='Accuracy Distribution', ylabel='Frequency');

In [None]:
from sklearn import svm
from sklearn.metrics import f1_score
# Fit a support vector classifier with default settings and score it on the test split.
model = svm.SVC()
model.fit(X_train, y_train)
# Map predictions onto 0/1. The original else-branch was `preds[i] == 1`,
# a comparison with no effect, so values >= 0.5 were never actually set.
preds = np.where(model.predict(X_test) < 0.5, 0, 1)
num_correct = int(np.sum(preds == y_test))
print("The fraction of correctly classified examples in the test set is: " + str(num_correct / len(preds)))
# average=None reports one F1 score per class instead of a single aggregate.
f1_score(y_test, preds, average = None)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
# Fit a 300-stage gradient boosting classifier (no depth limit on its trees) and score it on the test split.
model = GradientBoostingClassifier(n_estimators = 300, max_depth= None)
model.fit(X_train, y_train)
# Map predictions onto 0/1. The original else-branch was `preds[i] == 1`,
# a comparison with no effect, so values >= 0.5 were never actually set.
preds = np.where(model.predict(X_test) < 0.5, 0, 1)
num_correct = int(np.sum(preds == y_test))
print("The fraction of correctly classified examples in the test set is: " + str(num_correct / len(preds)))
# average=None reports one F1 score per class instead of a single aggregate.
f1_score(y_test, preds, average = None)

In [None]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import f1_score
# Fit a logistic regression with default settings and score it on the test split.
model = LogisticRegression()
model.fit(X_train, y_train)
# Map predictions onto 0/1. The original else-branch was `preds[i] == 1`,
# a comparison with no effect, so values >= 0.5 were never actually set.
preds = np.where(model.predict(X_test) < 0.5, 0, 1)
num_correct = int(np.sum(preds == y_test))
print("The fraction of correctly classified examples in the test set is: " + str(num_correct / len(preds)))
# average=None reports one F1 score per class instead of a single aggregate.
f1_score(y_test, preds, average = None)