In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics


## Read processed dataset

In [6]:
df = pd.read_csv("processed_dataset.csv")
df.head()

Unnamed: 0,medIncome,pctWWage,perCapInc,whitePerCap,blackPerCap,indianPerCap,asianPerCap,otherPerCap,HispPerCap,pctPopUnderPov,...,Log_burglPerPop,Log_larcenies,Log_larcPerPop,Log_autoTheft,Log_autoTheftPerPop,Log_arsons,Log_arsonsPerPop,Log_ViolentCrimesPerPop,Log_nonViolPerPop,State
0,0.876345,1.600432,0.941602,1.15939,0.415994,-0.093158,0.039099,0.835051,0.56303,-0.420409,...,1.014704,1.96836,0.007233,2.586872,0.764607,2.145875,0.820506,1.989888,0.680567,AK
1,1.234794,1.437346,1.008933,1.308139,0.480702,-0.121228,-0.063869,0.771022,0.448013,-0.640405,...,-0.978673,-0.417671,-0.795302,0.100167,0.114909,-0.161035,-0.297702,-0.054134,0.062051,AK
2,-0.197109,1.260783,-0.170482,0.098767,-0.17932,-0.301121,-0.512889,0.059712,-0.383929,0.073097,...,0.754678,0.180323,-0.009341,0.625712,0.792283,0.867172,1.107332,-0.999713,-1.577354,AK
3,-0.759653,-0.24607,-0.53362,-0.11151,-0.827913,0.02376,0.036641,0.965798,-0.207766,1.060108,...,1.536764,1.24767,1.088329,1.324566,1.185399,0.976763,0.713426,1.06448,0.492163,AL
4,-0.860033,-0.837759,-0.739429,-0.833392,-0.602023,-0.510427,1.127371,0.222923,-1.085397,0.687006,...,0.449002,-0.219521,0.602244,-0.377621,0.468178,-0.542571,-0.12526,1.06448,0.492163,AL


In [7]:
df_new = df.drop(['Log_NumInShelters', 'Log_NumStreet', 'Log_murders', 'Log_murdPerPop', 'State', 'Log_nonViolPerPop'], axis=1)
df_new.head()
df_new.dropna(inplace=True)



## Create labels for classification

In [8]:
def high_or_low(rate):
    median_rate = df_new['Log_ViolentCrimesPerPop'].median()
    if rate<median_rate: 
        return 0
    else: 
        return 1


In [9]:
df_new['isViolentCrimePerPopHigh'] = df_new['Log_ViolentCrimesPerPop'].apply(high_or_low)

In [10]:
df_new

Unnamed: 0,medIncome,pctWWage,perCapInc,whitePerCap,blackPerCap,indianPerCap,asianPerCap,otherPerCap,HispPerCap,pctPopUnderPov,...,Log_burglaries,Log_burglPerPop,Log_larcenies,Log_larcPerPop,Log_autoTheft,Log_autoTheftPerPop,Log_arsons,Log_arsonsPerPop,Log_ViolentCrimesPerPop,isViolentCrimePerPopHigh
0,0.876345,1.600432,0.941602,1.159390,0.415994,-0.093158,0.039099,0.835051,0.563030,-0.420409,...,2.256699,1.014704,1.968360,0.007233,2.586872,0.764607,2.145875,0.820506,1.989888,1
1,1.234794,1.437346,1.008933,1.308139,0.480702,-0.121228,-0.063869,0.771022,0.448013,-0.640405,...,-0.706527,-0.978673,-0.417671,-0.795302,0.100167,0.114909,-0.161035,-0.297702,-0.054134,0
2,-0.197109,1.260783,-0.170482,0.098767,-0.179320,-0.301121,-0.512889,0.059712,-0.383929,0.073097,...,0.701692,0.754678,0.180323,-0.009341,0.625712,0.792283,0.867172,1.107332,-0.999713,0
3,-0.759653,-0.246070,-0.533620,-0.111510,-0.827913,0.023760,0.036641,0.965798,-0.207766,1.060108,...,1.623161,1.536764,1.247670,1.088329,1.324566,1.185399,0.976763,0.713426,1.064480,1
4,-0.860033,-0.837759,-0.739429,-0.833392,-0.602023,-0.510427,1.127371,0.222923,-1.085397,0.687006,...,-0.099189,0.449002,-0.219521,0.602244,-0.377621,0.468178,-0.542571,-0.125260,1.064480,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1585,-1.436723,-2.100658,-0.767483,-0.855929,-0.706228,-0.527513,1.455122,-0.643885,-0.570320,1.974283,...,0.769471,0.430378,1.195098,1.060460,1.056291,0.756143,0.449139,-0.058941,1.116249,1
1588,0.554841,1.268870,-0.566612,-0.687122,0.916683,0.178024,-0.911653,0.078186,-0.096613,-0.401085,...,-0.745365,-0.340897,-1.005855,-0.646983,-0.668912,0.073945,-1.266200,-1.154135,-0.308101,0
1589,0.118178,1.698822,-0.393572,-0.534317,-1.587601,-0.714485,-0.542659,0.935667,-0.681928,-0.030955,...,0.155411,0.561964,-0.928590,-1.031806,0.085088,0.874690,-0.732181,-0.704035,0.658741,1
1591,-1.313636,0.941352,-0.846709,-0.945629,-1.162882,-0.539840,-1.043026,0.072626,-0.243226,1.654693,...,0.176320,0.251005,-0.730440,-1.264087,0.066208,0.146189,-0.482484,-0.738659,-0.054134,0


## Select Features and Label and set train test split

In [11]:

X = df_new.drop(['isViolentCrimePerPopHigh','Log_ViolentCrimesPerPop'], axis=1)
y = df_new['isViolentCrimePerPopHigh']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 424)


## Initiate and test random forest classifier

In [29]:
 # create classifier
rf_classifier = RandomForestClassifier() 
rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_rf))


Accuracy: 0.7346938775510204


In [30]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier()
 #Initialize with whatever parameters you want to

print(np.mean(cross_val_score(clf, X, y, cv=10)))

0.7272890843662536


## Initiate and test random forest tuned with random search CV

In [14]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 8)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 7)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [50, 71, 92, 114, 135, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 9, 13, 17, 21, 25, 30, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [28]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)
y_pred_best = rf_random.predict(X_test)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   19.5s finished


In [16]:
print('best features for random forest are:', rf_random.best_params_)
print("R2 score:", metrics.accuracy_score(y_test,y_pred_best))
print("Improvement from untuned random forest:", (metrics.accuracy_score(y_test,y_pred_best) - metrics.accuracy_score(y_test,y_pred_rf)) * 100,'%')


best features for random forest are: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 5, 'bootstrap': True}
R2 score: 0.7918367346938775
Improvement from untuned random forest: 0.40816326530611624 %


In [17]:
from sklearn.model_selection import cross_val_score

print(np.mean(cross_val_score(rf_random, X, y, cv=10)))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   16.1s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.9s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   13.5s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   11.9s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   11.9s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.7s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.5s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.4s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   15.6s finished


0.7444022391043583


## Initiate and test Bagging Classifier

In [19]:
from sklearn.ensemble import BaggingClassifier
bagging_classifier = BaggingClassifier() 
bagging_classifier.fit(X_train, y_train)
y_pred_bg = bagging_classifier.predict(X_test)
metrics.accuracy_score(y_test,y_pred_bg)


0.7428571428571429

In [31]:
print(np.mean(cross_val_score(BaggingClassifier(), X, y, cv=10)))

0.704411568705851


## Initiate and test XGBoost Classifier

In [20]:
data_dmatrix = xgb.DMatrix(data=X,label=y)


In [21]:
xg_classifier = xgb.XGBClassifier(learning_rate = 0.01,)
xg_classifier.fit(X_train,y_train)
y_pred_xg = xg_classifier.predict(X_test)
metrics.accuracy_score(y_test,y_pred_xg)



0.7510204081632653

In [22]:
print(np.mean(cross_val_score(xg_classifier, X, y, cv=10)))

0.7167199786751965


## Initiate and test LightGBM Classifier

In [23]:
from lightgbm import LGBMClassifier


In [24]:
LGBM_clf = LGBMClassifier()
LGBM_clf.fit(X_train,y_train)
y_pred_lgbm = LGBM_clf.predict(X_test)
metrics.accuracy_score(y_test,y_pred_lgbm)


0.7918367346938775

In [32]:
print(np.mean(cross_val_score(LGBMClassifier(), X, y, cv=10)))

0.6905371184859389


## Initiate and and test AdaBoost Classifier

In [33]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train,y_train)
y_pred_ada = ada_clf.predict(X_test)
metrics.accuracy_score(y_test,y_pred_ada)


0.6979591836734694

In [34]:
print(np.mean(cross_val_score(AdaBoostClassifier(), X, y, cv=10)))

0.6807676929228308
