In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

data = pd.read_hdf('..//data//model//data_model_clus.h5')

In [2]:
data.shape

(267363, 145)

In [3]:
X = data[[
    
 'Tenderloin',
 'South of Market',
 'Downtown',
 'Inner Mission',
 'Van Ness/Civic Center',
 'Duboce Triangle',
 'Haight Ashbury',
 'Miraloma Park',
 'Financial District/Barbary Coast',
 'Silver Terrace', 
    
 'winter',
 'spring',
 'summer',
 'fall',
 'hour_0',
 #'hour_6',
 'hour_12',
 #'hour_18',
 'dow_0',
 'dow_1',
 'dow_2',
 'dow_3',
 'dow_4',
 'dow_5',
 'dow_6',
 'UNIVPROX',
 'SIGNALIZED',
 'PKGMETERS',
 'MAXPCTSLPE',
 'MODEL6_VOL',
 'HH_PEDMODE',
 'PCOL_04_09',
 'PCOL_RATE',
 'HH_income',
 'total_pop',
 'unemp_20_24',
 'unemp_25_44',
 'unemp_45_54',
 'unemp_55_64',
 'white',
 'black',
 'asian',
 'other'
]]

y = data['poop']

X_imp = Imputer().fit_transform(X)
X_scale = StandardScaler().fit_transform(X_imp)

In [6]:
scoring = {'AUC': 'roc_auc',
          'F1': make_scorer(f1_score)
          'Accuracy': make_scorer(accuracy_score),
          'Precision': make_scorer(precision_score),
          'Recall': make_scorer(recall_score)
          }

# random forest
gs_rf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                     param_grid = {'criterion':['gini', 'entropy'],
                                   'max_depth':range(2,10,2),
                                   'n_estimators':range(60,120,15),
                                   'class_weight':[{1 : 4, 0 : 1},
                                                   {1 : 5, 0 : 1},
                                                   {1 : 6, 0 : 1},
                                                   {1 : 7, 0 : 1}]
                                   }, 
                     scoring=scoring,
                     cv=StratifiedKFold(),
                     refit='AUC')
gs_rf.fit(X_scale, y)
rf_results = gs_rf.cv_results_

In [7]:
print('\nRandom Forest\n\nBest Params')
print(gs_rf.best_params_)
print('Recall')
print(np.max(rf_results['mean_test_Recall']))
print('Precision')
print(np.max(rf_results['mean_test_Precision']))


Random Forest

Best Params
{'class_weight': {1: 5, 0: 1}, 'criterion': 'gini', 'max_depth': 4, 'n_estimators': 75}
Recall
0.7124995277410379
Precision
0.7928304884659098


In [16]:
pd.DataFrame({'importance':np.round(gs_rf.best_estimator_.feature_importances_,3),
              'feature':X.columns}).sort_values('importance', ascending = False)

Unnamed: 0,feature,importance
34,unemp_25_44,0.129
31,HH_income,0.126
0,Tenderloin,0.085
1,South of Market,0.079
35,unemp_45_54,0.061
40,other,0.059
30,PCOL_RATE,0.045
32,total_pop,0.043
37,white,0.042
39,asian,0.037
