In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from preprocess import *


# Load the raw CSV into a DataFrame; the path is relative to the notebook's
# working directory.
map_data = pd.read_csv("Murder_Data/SHR76_20.csv")

In [3]:
# clean_dataframe is not defined in this notebook; presumably it is provided by
# the star import of `preprocess` -- check there for the cleaning that is applied.
data_cleaned = clean_dataframe(map_data)

In [4]:
# Show the cleaned frame as the cell output (pandas abbreviates the rendering
# to the first/last rows and columns).
data_cleaned

Unnamed: 0,ID,CNTYFIPS,Ori,State,Agency,Agentype,Source,Solved,Year,Month,...,OffRace,OffEthnic,Weapon,Relationship,Circumstance,Subcircum,VicCount,OffCount,FileDate,MSA
1,197701001AL00400,"Autauga, AL",AL00400,Alabama,Autauga County,Sheriff,FBI,Yes,1977,January,...,Black,Unknown,Unknown,Acquaintance,Brawl due to influence of alcohol,Unknown,0,0,30180.0,"Montgomery, AL"
2,197703001AL00400,"Autauga, AL",AL00400,Alabama,Autauga County,Sheriff,FBI,Yes,1977,March,...,White,Unknown,"Handgun - pistol, revolver, etc",Acquaintance,Other arguments,Unknown,0,1,30180.0,"Montgomery, AL"
3,197703001AL00401,"Autauga, AL",AL00401,Alabama,Prattville,Municipal police,FBI,Yes,1977,March,...,Black,Unknown,Shotgun,Husband,Other arguments,Unknown,0,0,30180.0,"Montgomery, AL"
4,197708001AL00400,"Autauga, AL",AL00400,Alabama,Autauga County,Sheriff,FBI,Yes,1977,August,...,Black,Unknown,Knife or cutting instrument,Acquaintance,Unknown,Unknown,0,0,30180.0,"Montgomery, AL"
5,197710001AL00400,"Autauga, AL",AL00400,Alabama,Autauga County,Sheriff,FBI,Yes,1977,October,...,Black,Unknown,Shotgun,Stranger,Robbery,Unknown,0,0,30180.0,"Montgomery, AL"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827213,198112001WY02300,"Weston, WY",WY02300,Wyoming,Weston County,Sheriff,FBI,Yes,1981,December,...,White,Not of Hispanic origin,"Handgun - pistol, revolver, etc",Neighbor,Other arguments,Unknown,0,0,22482.0,Rural Wyoming
827214,198210001WY02300,"Weston, WY",WY02300,Wyoming,Weston County,Sheriff,FBI,Yes,1982,October,...,White,Not of Hispanic origin,"Personal weapons, includes beating",Girlfriend,Other arguments,Unknown,0,0,121482.0,Rural Wyoming
827215,198701001WY02300,"Weston, WY",WY02300,Wyoming,Weston County,Sheriff,FBI,Yes,1987,January,...,White,Unknown,Asphyxiation - includes death by gas,Son,All other manslaughter by negligence,Unknown,0,0,61987.0,Rural Wyoming
827216,199412001WY02300,"Weston, WY",WY02300,Wyoming,Weston County,Sheriff,FBI,Yes,1994,December,...,Asian,Unknown,Unknown,Acquaintance,All other manslaughter by negligence,Unknown,0,0,22795.0,Rural Wyoming


In [5]:
# The tree model cannot handle string values, so rows whose victim or offender
# age is the literal 'Unknown' are dropped and both age columns are converted
# to numbers.
has_known_ages = (
    (data_cleaned['VicAge'].astype(str) != 'Unknown')
    & (data_cleaned['OffAge'].astype(str) != 'Unknown')
)
# .copy() detaches the filtered frame from the unfiltered one, so the column
# assignments below are guaranteed to land on this frame instead of on a
# temporary slice (the SettingWithCopyWarning situation, which the global
# warnings filter would otherwise hide).
data_cleaned = data_cleaned[has_known_ages].copy()
data_cleaned['VicAge'] = pd.to_numeric(data_cleaned['VicAge'])
data_cleaned['OffAge'] = pd.to_numeric(data_cleaned['OffAge'])

# Features: victim / incident attributes. Targets: offender attributes.
# VicAge is already numeric at this point, so it is not converted again.
x = data_cleaned[['State','VicAge','VicSex','VicRace','Weapon','Subcircum']]
y = data_cleaned[['OffAge','OffRace','OffSex','OffEthnic']]

print(x.head())
print(y.head())

#create dummy variables: https://datascience.stackexchange.com/questions/5226/strings-as-features-in-decision-tree-random-forest
# Every categorical column is one-hot encoded; the numeric age columns are
# passed through unchanged.
x = pd.get_dummies(x,columns=["State",'VicSex','VicRace','Weapon','Subcircum'])
y = pd.get_dummies(y,columns=["OffRace","OffSex","OffEthnic"])

# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=20)

     State  VicAge  VicSex VicRace                           Weapon Subcircum
1  Alabama      65  Female   Black                          Unknown   Unknown
2  Alabama      48    Male   White  Handgun - pistol, revolver, etc   Unknown
3  Alabama      27    Male   Black                          Shotgun   Unknown
4  Alabama      17  Female   Black      Knife or cutting instrument   Unknown
5  Alabama      62    Male   Asian                          Shotgun   Unknown
   OffAge OffRace  OffSex OffEthnic
1      62   Black    Male   Unknown
2      52   White    Male   Unknown
3      22   Black  Female   Unknown
4      21   Black    Male   Unknown
5      80   Black    Male   Unknown


## Decision Tree
https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression_multioutput.html#sphx-glr-auto-examples-tree-plot-tree-regression-multioutput-py

In [6]:
# Multi-output regression tree limited to depth 5, fitted on the training
# split. (The variable is called td3 although max_depth is 5.)
td3 = DecisionTreeRegressor(max_depth=5).fit(x_train, y_train)
td3

DecisionTreeRegressor(max_depth=5)

In [7]:
# For a regressor, score() reports the coefficient of determination (R^2) on
# the given data -- here the held-out test split -- not a classification accuracy.
td3.score(x_test,y_test)

0.036728133075331314

In [8]:
# Reduced feature set: State and Subcircum are dropped, and every firearm-like
# weapon description is collapsed into a single 'firearm' category.
# .copy() gives x its own data, so the .loc assignment below modifies x itself
# rather than a slice of data_cleaned (and leaves data_cleaned untouched).
x = data_cleaned[['VicAge','VicSex','VicRace','Weapon']].copy()
# na=False keeps the mask strictly boolean even if Weapon has missing values;
# such rows are simply left as they are.
is_firearm = x['Weapon'].str.contains("gun|pistol|Rifle|Firearm",
                                      regex=True, na=False)
x.loc[is_firearm, 'Weapon'] = 'firearm'
# VicAge / OffAge were already converted to numeric when data_cleaned was
# filtered, so no further conversion is needed here.
y = data_cleaned[['OffAge','OffRace','OffSex']]

print(x.head())
print(y.head())

#create dummy variables: https://datascience.stackexchange.com/questions/5226/strings-as-features-in-decision-tree-random-forest
x = pd.get_dummies(x,columns=['VicSex','VicRace','Weapon'])
y = pd.get_dummies(y,columns=["OffRace","OffSex"])

# Same 80/20 split and seed as before, now on the reduced feature set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=20)
x_train

   VicAge  VicSex VicRace                       Weapon
1      65  Female   Black                      Unknown
2      48    Male   White                      firearm
3      27    Male   Black                      firearm
4      17  Female   Black  Knife or cutting instrument
5      62    Male   Asian                      firearm
   OffAge OffRace  OffSex
1      62   Black    Male
2      52   White    Male
3      22   Black  Female
4      21   Black    Male
5      80   Black    Male


Unnamed: 0,VicAge,VicSex_Female,VicSex_Male,VicRace_American Indian or Alaskan Native,VicRace_Asian,VicRace_Black,VicRace_Native Hawaiian or Pacific Islander,VicRace_White,Weapon_Asphyxiation - includes death by gas,"Weapon_Blunt object - hammer, club, etc",...,Weapon_Explosives,Weapon_Fire,Weapon_Knife or cutting instrument,"Weapon_Narcotics or drugs, sleeping pills","Weapon_Personal weapons, includes beating",Weapon_Poison - does not include gas,Weapon_Pushed or thrown out window,Weapon_Strangulation - hanging,Weapon_Unknown,Weapon_firearm
323434,20,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
690202,19,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
245916,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
276371,35,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
750487,21,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79893,31,0,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
722549,1,0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
5044,58,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
453954,34,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
# Much deeper tree (max_depth=20) on the current train/test split.
td20 = DecisionTreeRegressor(max_depth=20).fit(x_train, y_train)
# Training-split score is printed; the test-split score is the cell's output.
# Both are R^2 values (this is a regressor), not accuracies.
print(td20.score(x_train, y_train))
td20.score(x_test, y_test)
# The score is higher than before, but at a cost: a tree this deep would be
# unreadable if plotted.

0.3395726571520382


0.26664573689233534

Better, but still not great: the depth-20 tree scores about 0.34 on the training split and about 0.27 on the test split. Maybe fit one tree per target feature instead?

## KNN
Based on: https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html

In [14]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

# One k-nearest-neighbours classifier (default settings) is fitted per target
# column via the multi-output wrapper.
# NOTE(review): if y_train still comes from the preceding split, it contains
# the numeric OffAge column, which is then treated as a class label -- confirm
# this is intended. The execution count jumps from 9 to 14, so also confirm
# which x/y split was live when this cell ran.
knn = MultiOutputClassifier(KNeighborsClassifier())
knn.fit(x_train, y_train)
# Scoring takes quite some time, so for now only the first 1000 test rows
# are evaluated.
knn.score(x_test.iloc[:1000], y_test.iloc[:1000])

0.026