In [36]:
# Start the notebook server by running "jupyter notebook" in an Anaconda terminal.
# This script trains a Decision Tree Classifier on a dataset of murder records
# to predict the race of a murderer from the race of their victim.
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

#1. Load the raw murder records.
# low_memory=False tells pandas not to parse the file in chunks, which avoids
# mixed-type inference within a column.
murder_data = pd.read_csv('database.csv', low_memory=False)

#2. Clean the data: replace the spaces in the column headers with underscores
# so columns can be reached by attribute access (murder_data.Perpetrator_Race).
# Note: 'Victim Sex' maps to 'Victim_sex' (lowercase "s"), unlike the other
# targets; it is kept exactly as-is so existing references keep working.
COLUMN_RENAMES = {
    'Record ID': 'Record_ID',
    'Agency Code': 'Agency_Code',
    'Agency Name': 'Agency_Name',
    'Agency Type': 'Agency_Type',
    'Crime Type': 'Crime_Type',
    'Crime Solved': 'Crime_Solved',
    'Victim Sex': 'Victim_sex',
    'Victim Age': 'Victim_Age',
    'Victim Ethnicity': 'Victim_Ethnicity',
    'Victim Race': 'Victim_Race',
    'Perpetrator Age': 'Perpetrator_Age',
    'Perpetrator Sex': 'Perpetrator_Sex',
    'Perpetrator Race': 'Perpetrator_Race',
    'Perpetrator Ethnicity': 'Perpetrator_Ethnicity',
    'Victim Count': 'Victim_Count',
    'Perpetrator Count': 'Perpetrator_Count',
    'Record Source': 'Record_Source',
}
murder_data = murder_data.rename(columns=COLUMN_RENAMES)

# Split the data set into input and output: Victim Race is the single feature,
# Perpetrator Race is the target.
# The races are "White", "Black", "Unknown", "Asian/Pacific Islander" and
# "Native American/Alaska Native".
# Encode the race labels as integers so the classifier can consume them.
# LabelEncoder numbers the classes in SORTED order, not in the order listed
# above, so (assuming each column holds exactly those five labels) the codes are:
#   Asian/Pacific Islander 0, Black 1, Native American/Alaska Native 2,
#   Unknown 3, White 4
le_victim = LabelEncoder()
murder_data['Victim_Race'] = le_victim.fit_transform(murder_data['Victim_Race'])

le_perp = LabelEncoder()
murder_data['Perpetrator_Race'] = le_perp.fit_transform(murder_data['Perpetrator_Race'])

# Keep the actual label -> code lookup of each fitted encoder, rather than
# relying on a hand-written comment. classes_[i] is the label encoded as i.
victim_race_codes = {label: code for code, label in enumerate(le_victim.classes_)}
perp_race_codes = {label: code for code, label in enumerate(le_perp.classes_)}

# Double brackets keep X two-dimensional (a one-column DataFrame); y is a Series.
X = murder_data[['Victim_Race']]
y = murder_data['Perpetrator_Race']

#3. Split the data into training (80%) and test (20%) sets.
# train_test_split shuffles the rows before splitting; pinning the seed makes
# the split, and any score computed from it, identical on every run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Hand the targets to scikit-learn as flat NumPy arrays of shape (n_samples,).
# y is a Series, so the conversion is already one-dimensional.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

#4. Create the model.
# Seeded so that repeated fits on the same training data produce the same tree.
MODEL_SEED = 42
model = DecisionTreeClassifier(random_state=MODEL_SEED)

#5. Train the model.
model.fit(X_train, y_train)

#6. Make predictions for the held-out test rows.
predictions = model.predict(X_test)

#7. Evaluate: the fraction of test rows whose predicted perpetrator-race code
# equals the recorded one. Kept as the last expression so the notebook shows it.
score = accuracy_score(y_test, predictions)
score







0.6074351363839269

In [51]:
# Share of each perpetrator-race code among records whose perpetrator sex is "Male".
# The codes come from LabelEncoder, which numbers labels in sorted order; assuming
# the five race labels are the only values present:
#   Asian/Pacific Islander 0, Black 1, Native American/Alaska Native 2,
#   Unknown 3, White 4
male_perp_rows = murder_data["Perpetrator_Sex"] == "Male"
murder_data.loc[male_perp_rows, "Perpetrator_Race"].value_counts(normalize=True)

4    0.490155
1    0.474885
3    0.013771
0    0.013638
2    0.007551
Name: Perpetrator_Race, dtype: float64

In [46]:
# Share of each perpetrator-race code among records whose perpetrator sex is "Female".
# The codes come from LabelEncoder, which numbers labels in sorted order; assuming
# the five race labels are the only values present:
#   Asian/Pacific Islander 0, Black 1, Native American/Alaska Native 2,
#   Unknown 3, White 4
female_perp_rows = murder_data["Perpetrator_Sex"] == "Female"
murder_data.loc[female_perp_rows, "Perpetrator_Race"].value_counts(normalize=True)


1    0.507704
4    0.460204
2    0.011906
0    0.011885
3    0.008301
Name: Perpetrator_Race, dtype: float64