In [5]:
import warnings
import pandas as pd
import numpy as np
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

data = pd.read_csv('..//data//model//data_to_model.csv')

In [6]:
X = data[[
    
 'winter',
 'spring',
 'summer',
 'fall',
 'hour_0',
 'hour_6',
 'hour_12',
 'hour_18',
 
 'UNIVPROX',
 'SIGNALIZED',
 'PKGMETERS',
 'MAXPCTSLPE',
 'MODEL6_VOL',
 'HH_PEDMODE',
 'PCOL_04_09',
 'PCOL_RATE',
 'HH_income'
]]

y = data['poop']

X_imp = Imputer().fit_transform(X)
X_scale = StandardScaler().fit_transform(X_imp)

In [23]:
scoring = {'AUC': 'roc_auc',
          'F1': make_scorer(f1_score)
          'Accuracy': make_scorer(accuracy_score),
          'Precision': make_scorer(precision_score),
          'Recall': make_scorer(recall_score)
          }


# Naive Bayes
gs_nb = GridSearchCV(BernoulliNB(),
                     param_grid = {'alpha':[0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]}, 
                     scoring=scoring,
                     cv=StratifiedKFold()
                     refit='AUC')
gs_nb.fit(X_scale, y)
nb_results = gs_nb.cv_results_

# logistic regression
gs_lr = GridSearchCV(LogisticRegression(),
                     param_grid = {'penalty':['l1','l2'],
                                   'C':[0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]}, 
                     scoring=scoring,
                     cv=StratifiedKFold()
                     refit='AUC')
gs_lr.fit(X_scale, y)
lr_results = gs_lr.cv_results_

# K nearest neighbors
gs_knn = GridSearchCV(KNeighborsClassifier(),
                     param_grid = {'n_neighbors':range(2,20,2),
                                   'weights':['uniform','distance'],
                                   'metric':['minkowski','manhattan','euclidean']}, 
                     scoring=scoring,
                     cv=StratifiedKFold()
                     refit='AUC')
gs_knn.fit(X_scale, y)
knn_results = gs_knn.cv_results_

# support vector machine
gs_svm = GridSearchCV(SVC(class_weight = 'balanced'),
                     param_grid = {'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
                                   'degree':[1,2,3],
                                   'C':[0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000],
                                   #'gamma':[0.00001,0.0001,0.001,0.01,0.1,1,10,100,1000]
                                  }, 
                     scoring=scoring,
                     cv=StratifiedKFold()
                     refit='AUC')
gs_svm.fit(X_scale, y)
svm_results = gs_svm.cv_results_

# decision trees
gs_dt = GridSearchCV(DecisionTreeClassifier(random_state=42),
                     param_grid={'criterion': ['gini', 'entropy'],
                                 'max_depth': range(2,8,2)},
                     scoring=scoring,
                     cv=StratifiedKFold(),
                     refit='AUC')
gs_dt.fit(X_scale, y)
dt_results = gs_dt.cv_results_

# gradient boosted trees
gs_gbt = GridSearchCV(GradientBoostingClassifier(),
                     param_grid = {'loss':['deviance', 'exponential'],
                                   'learning_rate':[0.00001,0.0001,0.001,0.01,0.1,1,10],
                                   'max_depth':range(4,10,2),
                                   'n_estimators':range(60,120,15),
                                   'max_features':range(5,20,5)
                                   }, 
                     scoring=scoring,
                     cv=StratifiedKFold()
                     refit='AUC')
gs_gbt.fit(X_scale, y)
gbt_results = gs_gbt.cv_results_

# random forest
gs_rf = GridSearchCV(RandomForestClassifier(class_weight='balanced', n_jobs=-1),
                     param_grid = {'criterion':['gini', 'entropy'],
                                   'max_depth':range(4,10,2),
                                   'n_estimators':range(60,120,15)
                                   }, 
                     scoring=scoring,
                     cv=StratifiedKFold()
                     refit='AUC')
gs_rf.fit(X_scale, y)
rf_results = gs_rf.cv_results_

KeyboardInterrupt: 

In [24]:
print('\nNaive Bayes\n\nBest Params')
print(gs_nb.best_params_)
print('Recall')
print(np.max(nb_results['mean_test_Recall']))
print('Precision')
print(np.max(nb_results['mean_test_Precision']))

print('\nLogisticRegression\n\nBest Params')
print(gs_lr.best_params_)
print('Recall')
print(np.max(lr_results['mean_test_Recall']))
print('Precision')
print(np.max(lr_results['mean_test_Precision']))

print('\nK Nearest Neighbors\n\nBest Params')
print(gs_knn.best_params_)
print('Recall')
print(np.max(knn_results['mean_test_Recall']))
print('Precision')
print(np.max(knn_results['mean_test_Precision']))

print('\nSupport Vector Machines\n\nBest Params')
print(gs_svm.best_params_)
print('Recall')
print(np.max(svm_results['mean_test_Recall']))
print('Precision')
print(np.max(svm_results['mean_test_Precision']))

print('Decision Tree\n\nBest Params')
print(gs_dt.best_params_)
print('Recall')
print(np.max(dt_results['mean_test_Recall']))
print('Precision')
print(np.max(dt_results['mean_test_Precision']))

print('\nGradient Boosted Trees\n\nBest Params')
print(gs_gbt.best_params_)
print('Recall')
print(np.max(gbt_results['mean_test_Recall']))
print('Precision')
print(np.max(gbt_results['mean_test_Precision']))

print('\nRandom Forest\n\nBest Params')
print(gs_rf.best_params_)
print('Recall')
print(np.max(rf_results['mean_test_Recall']))
print('Precision')
print(np.max(rf_results['mean_test_Precision']))


Naive Bayes

Best Params
{'alpha': 1e-05}
Recall
0.05137260499171376
Precision
0.08721239277562796

LogisticRegression

Best Params
{'C': 1000, 'penalty': 'l2'}
Recall
0.09413335240719553
Precision
0.09241454766057752

K Nearest Neighbors

Best Params
{'metric': 'minkowski', 'n_neighbors': 18, 'weights': 'uniform'}
Recall
0.21764597715327863
Precision
0.0690505968888187

Support Vector Machines

Best Params


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'