In [7]:
import pandas as pd
import numpy as np
np.random.seed(0)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from time import time

# model: 'megan'
## random forest

In [2]:
print('importing')
df = pd.read_csv('merged.csv')
df = df.drop(['Unnamed: 0','state_y','state_x'], axis=1) # drop unneeded cols
# df = df.drop(to_drop,axis=1)
storage = df # store imported copy

importing


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print('dropping nulls')
df = df.dropna()

print('begin sample')
# sample down to 30k
df = df.sample(10000, random_state=1)
print('sample complete')
# drop nulls?


dropping nulls
begin sample
sample complete


In [4]:
# df with dummies
print('getting dummies')
# remove Y to avoid Y dummies
target = df['UCR_code']
df = df.drop('UCR_code',axis=1)
print('dropped y')


getting dummies
dropped y


In [5]:
# dummify

df = pd.get_dummies(df,drop_first=True)
print('dummies acquired')
features = list(df.columns) # get dummy list

df['UCR_code'] = target # this should replace the original undummied Y
print('done with imports & cleaning')

dummies acquired
done with imports & cleaning


In [6]:
df['UCR_code'] = target

In [10]:
df = df.drop('UCR_code',axis=1)

In [11]:
df.dtypes.value_counts()

uint8      11347
int64          5
float64        3
dtype: int64

In [12]:
# make training splits
data = df
#  = target
data_train, data_test, target_train, target_test = train_test_split(data, target, 
                                                                   test_size = 0.25, random_state=123)

In [13]:
# regular baseline tree
tree_clf = DecisionTreeClassifier(criterion = "gini", max_depth = 5) 
tree_clf.fit(data_train, target_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [15]:
# this is hard to interpret with 11000 columns
tree_clf.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [18]:
# graphing 11k columns isn't a superb idea
def plot_feature_importances(model):
    n_features = data_train.shape[1]
    plt.figure(figsize=(20,12))
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), data_train.columns.values) 
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

# plot_feature_importances(tree_clf)

In [19]:
# confusion matrix to see how the tree thinks
pred = tree_clf.predict(data_test)
print(confusion_matrix(target_test, pred))
print(classification_report(target_test, pred))

[[   0    0    0    0    0    0    0    0    0    7    0    0    0]
 [   0    0    0    0    0    0    0    0    0    2    0    0    0]
 [   0    0    0    1    0    0    0    0    2   31    0    0    0]
 [   0    0    0   83    0    0    0    0    0    3    0    0    0]
 [   0    0    0    0    0    0    0    0    0   19    0    0    0]
 [   0    0    0    0    0    0    0    0    0    9    0    0    0]
 [   0    0    0    1    0    0   56    0    0    0    0    0    0]
 [   0    0    0    0    0    0    0  127    3    3    0    8    0]
 [   0    0    0    1    0    0    0    1  461   19    0    2    0]
 [   0    0    0    0    0    0    0    1   12 1594    0    0    0]
 [   0    0    0    0    0    0    0    0    0    4    0    0    0]
 [   0    0    0    0    0    0    0    2   23   15    0    9    0]
 [   0    0    0    0    0    0    0    0    0    1    0    0    0]]
              precision    recall  f1-score   support

         09A       0.00      0.00      0.00         7
      

  'precision', 'predicted', average, warn_for)


In [20]:
# test accuracy
print("Testing Accuracy for Decision Tree Classifier: {:.4}%".format(accuracy_score(target_test, pred) * 100))


Testing Accuracy for Decision Tree Classifier: 93.2%


# random forest:

In [21]:
forest = RandomForestClassifier(n_estimators=100, max_depth= 5)
forest.fit(data_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
forest.score(data_train, target_train)

0.6342666666666666

In [24]:
forest.score(data_test, target_test)

0.6428

# interesting results: random forest outperforms Adaboost .64 to .40
# but RF had n=10,000 ; adaboost 5,000

## Re-binning

In [42]:
bins = {
    "Assault" : ['13A', '13B', '13C',],
    "Sex_Offense" : ['11A', '11B', '11C', '11D', '36A', '36B',],
    "Financial_Fraud_Gambling" : ['510', '250', '270', '210', '26A', '26B','26C', '26D', '26E', '39A', '39B', '39C', '39D'],
    "Theft" : ['220','23A','23B','23C','23D', '23E', '23F', '23G','23H', '240','120', '280'],
    "Arson" : ['200',],
    "Drugs" : ['35A', '35B',],
    "Murder" : ['09A'],
    "Negligent_Manslaughter" : ['09B'],
    "Justifiable_Homicide" : ['09C'],
    "Abduction" : ['100'],
    "Obscene_Prostitution" : ['370','40A','40B','40C',],
    "Weapon_Violation" : ['520'],
}

def cleaner(row): # if value in list, change value to that list's label
    entry = row['UCR_code']
    for key, value in bins.items():
        if entry in value:
            row['UCR_code'] = key
    return row
# josh = josh.apply(lambda row: cleaner(row), axis=1)

NameError: name 'josh' is not defined

In [43]:
len(df)

10000

In [47]:
df = df.apply(lambda row: cleaner(row),axis=1)

KeyError: ('UCR_code', 'occurred at index location_type')

In [30]:
!pip install pydotplus

Collecting pydotplus
[?25l  Downloading https://files.pythonhosted.org/packages/60/bf/62567830b700d9f6930e9ab6831d6ba256f7b0b730acb37278b0ccdffacf/pydotplus-2.0.2.tar.gz (278kB)
[K     |████████████████████████████████| 286kB 5.0MB/s eta 0:00:01
Building wheels for collected packages: pydotplus
  Building wheel for pydotplus (setup.py) ... [?25ldone
[?25h  Created wheel for pydotplus: filename=pydotplus-2.0.2-cp37-none-any.whl size=24567 sha256=3288f881c47c3da551c5b9a0127a7089efe81092090a0c8ced01f836a3fefa2c
  Stored in directory: /Users/mark/Library/Caches/pip/wheels/35/7b/ab/66fb7b2ac1f6df87475b09dc48e707b6e0de80a6d8444e3628
Successfully built pydotplus
Installing collected packages: pydotplus
Successfully installed pydotplus-2.0.2


In [32]:
clf = tree_clf

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)

# Train
model.fit(iris.data, iris.target)
# Extract single tree
estimator = model.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = iris.feature_names,
                class_names = iris.target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

In [36]:
import pydotplus
import six
from sklearn import tree
dotfile = six.StringIO()
i_tree = 0
for tree_in_forest in clf.estimators_:
    if (i_tree <1):
        tree.export_graphviz(tree_in_forest, out_file=dotfile)
        pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png('dtree'+ str(i_tree) +'.png')
        i_tree = i_tree + 1

AttributeError: 'DecisionTreeClassifier' object has no attribute 'estimators_'

In [38]:
!pip install graphviz

Collecting graphviz
  Downloading https://files.pythonhosted.org/packages/94/cd/7b37f2b658995033879719e1ea4c9f171bf7a14c16b79220bd19f9eda3fe/graphviz-0.13-py2.py3-none-any.whl
Installing collected packages: graphviz
Successfully installed graphviz-0.13


In [39]:
from sklearn.tree import export_graphviz
import graphviz

export_graphviz(tree_clf, out_file="mytree.dot")
with open("mytree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

ExecutableNotFound: failed to execute ['dot', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x7f8981c8a690>

ImportError: cannot import name 'convert_to_graphviz' from 'sklearn.tree' (/Users/mark/opt/anaconda3/lib/python3.7/site-packages/sklearn/tree/__init__.py)

In [41]:
from sklearn.datasets import load_iris
iris = load_iris()

# Model (can also use single decision tree)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)

# Train
model.fit(iris.data, iris.target)
# Extract single tree
estimator = model.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = iris.feature_names,
                class_names = iris.target_names,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

FileNotFoundError: [Errno 2] No such file or directory: 'dot': 'dot'