In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor
import xgboost as xgb

from sklearn.ensemble import RandomForestRegressor 


## Read processed dataset

In [13]:
df = pd.read_csv("processed_dataset.csv")
df.head()

Unnamed: 0,medIncome,pctWWage,perCapInc,whitePerCap,blackPerCap,indianPerCap,asianPerCap,otherPerCap,HispPerCap,pctPopUnderPov,...,Log_burglPerPop,Log_larcenies,Log_larcPerPop,Log_autoTheft,Log_autoTheftPerPop,Log_arsons,Log_arsonsPerPop,Log_ViolentCrimesPerPop,Log_nonViolPerPop,State
0,0.876345,1.600432,0.941602,1.15939,0.415994,-0.093158,0.039099,0.835051,0.56303,-0.420409,...,1.014704,1.96836,0.007233,2.586872,0.764607,2.145875,0.820506,1.989888,0.680567,AK
1,1.234794,1.437346,1.008933,1.308139,0.480702,-0.121228,-0.063869,0.771022,0.448013,-0.640405,...,-0.978673,-0.417671,-0.795302,0.100167,0.114909,-0.161035,-0.297702,-0.054134,0.062051,AK
2,-0.197109,1.260783,-0.170482,0.098767,-0.17932,-0.301121,-0.512889,0.059712,-0.383929,0.073097,...,0.754678,0.180323,-0.009341,0.625712,0.792283,0.867172,1.107332,-0.999713,-1.577354,AK
3,-0.759653,-0.24607,-0.53362,-0.11151,-0.827913,0.02376,0.036641,0.965798,-0.207766,1.060108,...,1.536764,1.24767,1.088329,1.324566,1.185399,0.976763,0.713426,1.06448,0.492163,AL
4,-0.860033,-0.837759,-0.739429,-0.833392,-0.602023,-0.510427,1.127371,0.222923,-1.085397,0.687006,...,0.449002,-0.219521,0.602244,-0.377621,0.468178,-0.542571,-0.12526,1.06448,0.492163,AL


In [14]:
df_new = df.drop(['Log_NumInShelters', 'Log_NumStreet', 'Log_murders', 'Log_murdPerPop', 'State', 'Log_nonViolPerPop'], axis=1)
df_new.head()
df_new.dropna(inplace=True)

## Select Features and Label and set train test split

In [15]:
X = df_new.drop(['Log_ViolentCrimesPerPop'], axis=1)
y = df_new['Log_ViolentCrimesPerPop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 424)


## Import, initiate and fit random forest regressor

In [18]:
 # create regressor object 
    
from sklearn.ensemble import RandomForestRegressor     
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0) 
regressor.fit(X_train, y_train)

y_pred_rf = regressor.predict(X_test)
r2_score(y_test,y_pred_rf)


0.433019480438791

## Evaluate model with 10 folds cross validation

In [20]:
from sklearn.model_selection import cross_val_score
# clf = RandomForestRegressor()
 #Initialize with whatever parameters you want to

print(np.mean(cross_val_score(RandomForestRegressor(), X, y, cv=10)))

0.40581416724892927


## Random Search to tune random forest

In [10]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 8)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 7)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [50, 71, 92, 114, 135, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 9, 13, 17, 21, 25, 30, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
y_pred_best = rf_random.predict(X_test)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   32.6s finished


In [36]:
print('best features for random forest are:', rf_random.best_params_)
print("R2 score:", r2_score(y_test,y_pred_best))
print("Improvement from untuned random forest:", (r2_score(y_test,y_pred_best) - r2_score(y_test,y_pred_rf)) * 100,'%')



best features for random forest are: {'n_estimators': 92, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 13, 'bootstrap': True}
R2 score: 0.5153508939246441
Improvement from untuned random forest: 2.4588250455359284 %


In [21]:
 #Initialize with whatever parameters you want to

print(np.mean(cross_val_score(rf_random, X_train, y_train, cv=10)))

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   30.7s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   27.1s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   28.0s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   26.5s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   26.6s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   26.7s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   26.9s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   29.3s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   26.8s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   28.1s finished


0.453504155240001


## Initiate and test Bagging Regressor

In [38]:
from sklearn.ensemble import BaggingRegressor
bagging_regressor = BaggingRegressor() 
bagging_regressor.fit(X_train, y_train)

y_pred_bg = bagging_regressor.predict(X_test)
r2_score(y_test,y_pred_bg)


0.4824129010078564

In [30]:
print(np.mean(cross_val_score(BaggingRegressor(), X_train, y_train, cv=10)))

0.39284335458578196


## Initiate and test  XGBoost

In [41]:
# convert the dataset into an optimized data structure called Dmatrix that XGBoost supports
data_dmatrix = xgb.DMatrix(data=X,label=y)


In [65]:
import xgboost as xgb
# convert the dataset into an optimized data structure called Dmatrix that XGBoost supports
data_dmatrix = xgb.DMatrix(data=X,label=y)
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 3, alpha = 10, n_estimators = 200)
xg_reg.fit(X_train,y_train)
y_pred_xg = xg_reg.predict(X_test)
r2_score(y_test,y_pred_xg)




0.5071993194396187

In [26]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 3, alpha = 10, n_estimators = 200)

print(np.mean(cross_val_score(xg_reg, X_train, y_train, cv=10)))

0.43288165881676255


## Initiate and test LightGBM

In [27]:
from lightgbm import LGBMRegressor
LGBM_reg = LGBMRegressor()
LGBM_reg.fit(X_train,y_train)
y_pred_lgbm = LGBM_reg.predict(X_test)
r2_score(y_test,y_pred_lgbm)


0.43805073775107417

In [25]:
print(np.mean(cross_val_score(LGBMRegressor(), X_train, y_train, cv=10)))

0.39696509424337617


## Initiate and test AdaBoost

In [29]:
from sklearn.ensemble import AdaBoostRegressor
ada_rgr = AdaBoostRegressor()
ada_rgr.fit(X_train,y_train)
y_pred_ada = ada_rgr.predict(X_test)
r2_score(y_test,y_pred_ada)


0.39539677546759666

In [31]:
print(np.mean(cross_val_score(AdaBoostRegressor(), X_train, y_train, cv=10)))

0.35517734968924747
