In [40]:
import warnings
import pandas as pd
import numpy as np
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

data = pd.read_hdf('..//data//model//data_model_clus.h5')

In [41]:
X = data[[
    
 'Tenderloin',
 'South of Market',
 'Downtown',
 'Inner Mission',
 'Van Ness/Civic Center',
 'Duboce Triangle',
 'Haight Ashbury',
 'Miraloma Park',
 'Financial District/Barbary Coast',
 'Silver Terrace', 
    
 'winter',
 'spring',
 'summer',
 'fall',
 'hour_0',
 #'hour_6',
 'hour_12',
 #'hour_18',
 'dow_0',
 'dow_1',
 'dow_2',
 'dow_3',
 'dow_4',
 'dow_5',
 'dow_6',
 'UNIVPROX',
 'SIGNALIZED',
 'PKGMETERS',
 'MAXPCTSLPE',
 'MODEL6_VOL',
 'HH_PEDMODE',
 'PCOL_04_09',
 'PCOL_RATE',
 'HH_income',
 'total_pop',
 'unemp_20_24',
 'unemp_25_44',
 'unemp_45_54',
 'unemp_55_64',
 'white',
 'black',
 'asian',
 'other'
]]

y = data['poop']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

X_train_imp = Imputer().fit_transform(X_train)
X_train_scale = StandardScaler().fit_transform(X_train_imp)

X_test_imp = Imputer().fit_transform(X_test)
X_test_scale = StandardScaler().fit_transform(X_test_imp)

In [42]:
small_rf = RandomForestClassifier(n_jobs=-1,
                                  class_weight={1: 5, 0: 1},
                                  criterion='gini', 
                                  max_depth=4, 
                                  n_estimators=100)
small_rf = small_rf.fit(X_train_scale, y_train)

In [43]:
recall_score(y_test, small_rf.predict(X_test_scale))

0.17271548930654568

In [44]:
recall_score(y_train, small_rf.predict(X_train_scale))

0.16837457977236825

In [47]:
scoring = {'AUC': 'roc_auc',
          'F1': make_scorer(f1_score),
          'Accuracy': make_scorer(accuracy_score),
          'Precision': make_scorer(precision_score),
          'Recall': make_scorer(recall_score)
          }

# random forest
gs_rf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                     param_grid = {'criterion':['gini', 'entropy'],
                                   'max_depth':range(2,10,2),
                                   'n_estimators':range(60,120,15),
                                   'class_weight':[{1 : 4, 0 : 1},
                                                   {1 : 5, 0 : 1},
                                                   {1 : 6, 0 : 1},
                                                   {1 : 7, 0 : 1}]
                                   }, 
                     scoring=scoring,
                     cv=StratifiedKFold(),
                     refit='AUC')
gs_rf.fit(X_train_scale, y_train)
rf_results = gs_rf.cv_results_

In [48]:
print('\nRandom Forest\n\nBest Params')
print(gs_rf.best_params_)
print('Recall')
print(np.max(rf_results['mean_test_Recall']))
print('Precision')
print(np.max(rf_results['mean_test_Precision']))


Random Forest

Best Params
{'class_weight': {1: 4, 0: 1}, 'criterion': 'gini', 'max_depth': 8, 'n_estimators': 75}
Recall
0.46721185805056037
Precision
0.5287001065893174


In [49]:
recall_score(y_test, gs_rf.best_estimator_.predict(X_test_scale))

0.3324692158133506

In [50]:
precision_score(y_test, gs_rf.best_estimator_.predict(X_test_scale))

0.31853461657870225

In [51]:
pd.DataFrame({'importance':np.round(gs_rf.best_estimator_.feature_importances_,3),
              'feature':X.columns}).sort_values('importance', ascending = False)

Unnamed: 0,feature,importance
31,HH_income,0.113
0,Tenderloin,0.092
34,unemp_25_44,0.092
40,other,0.058
35,unemp_45_54,0.056
37,white,0.052
1,South of Market,0.049
28,HH_PEDMODE,0.043
32,total_pop,0.042
38,black,0.042


In [53]:
X.shape

(366711, 41)