In [23]:
import pickle

import humanize
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle


%matplotlib inline
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Oversampled
timestamp = '20180509221611'
final_features = pd.read_pickle('./final_features-{}.pkl'.format(timestamp))
outcomes = pd.read_pickle('./labels-{}.pkl'.format(timestamp))

In [3]:
# Non-oversamled
# timestamp = '20180509221611'
# final_features = pd.read_pickle('./final_features-{}-non_oversampled.pkl'.format(timestamp))
# outcomes = pd.read_pickle('./labels-{}-non_oversampled.pkl'.format(timestamp))

In [4]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_outcomes = le.fit_transform(outcomes)

In [5]:
# Take out 5% of data for final final testing; shuffle first

final_features['stop_outcome'] = encoded_outcomes
final_features = shuffle(final_features, random_state=0)
outcomes = final_features.pop('stop_outcome')

lop_off_pct = .05
lop_off_idx = round(final_features.shape[0] * lop_off_pct)
print('lop_off_idx = {}'.format(lop_off_idx))

final_test_features = final_features[:lop_off_idx]
final_test_outcomes = outcomes[:lop_off_idx]

final_features = final_features[lop_off_idx:]
outcomes = outcomes[lop_off_idx:]

lop_off_idx = 50452


In [6]:
from sklearn.model_selection import train_test_split

# # Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_features, 
                                                    outcomes, 
                                                    test_size=0.2, 
                                                    random_state=0)


# GridSearchCV

## DecisionTreeClassifier

In [27]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier


params_dtc = {
#     'class_weight': None,
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 1, 2, 3, 5, 10, 19],
    'max_features': [None, 44, 'sqrt', 'log2'],
    'min_impurity_split': [0.0000001],
    'min_samples_split': [2, 3, 4], 
    'min_samples_leaf':[1, 2, 3],
    'min_weight_fraction_leaf': [0],
    'max_leaf_nodes': [None, 100, 1000, 2000],
    'random_state': [0],
}

dtc = DecisionTreeClassifier()

clf = GridSearchCV(dtc, params_dtc, scoring='accuracy', n_jobs=8, cv=5, verbose=3)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))


Fitting 5 folds for each of 4032 candidates, totalling 20160 fits


[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:   31.0s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:  3.1min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  7.7min
[Parallel(n_jobs=8)]: Done 496 tasks      | elapsed: 14.4min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed: 22.0min
[Parallel(n_jobs=8)]: Done 1136 tasks      | elapsed: 24.9min
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed: 27.7min
[Parallel(n_jobs=8)]: Done 2032 tasks      | elapsed: 30.7min
[Parallel(n_jobs=8)]: Done 2576 tasks      | elapsed: 33.1min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed: 36.8min
[Parallel(n_jobs=8)]: Done 3856 tasks      | elapsed: 41.3min
[Parallel(n_jobs=8)]: Done 4592 tasks      | elapsed: 45.9min
[Parallel(n_jobs=8)]: Done 5392 tasks      | elapsed: 52.1min
[Parallel(n_jobs=8)]: Done 6256 tasks      | elapsed: 60.6min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed: 67.3min
[Parallel(n_jobs=8)]: Done 8176 tasks      | elapsed: 85.2min
[Parallel(n_j

0.9197423325683288


In [28]:
print(clf.score(final_test_features, final_test_outcomes))

0.9221636406881789


In [30]:
clf.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0, presort=False,
            random_state=0, splitter='random')

In [31]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0,
 'random_state': 0,
 'splitter': 'random'}

In [33]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_impurity_split,param_min_samples_leaf,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,10.752858,0.214371,0.905006,0.978539,gini,,,,1e-07,1,...,0.904274,0.978629,0.904808,0.978585,0.905114,0.978634,0.495344,0.013856,0.000513,0.000106
1,11.725743,0.223996,0.905723,0.978539,gini,,,,1e-07,1,...,0.905194,0.978629,0.905923,0.978585,0.905766,0.978634,0.212399,0.018538,0.000709,0.000106
2,11.766043,0.227406,0.899003,0.977533,gini,,,,1e-07,1,...,0.898622,0.977589,0.898940,0.977607,0.898587,0.977568,0.314724,0.034649,0.000604,0.000074
3,12.103046,0.224197,0.897858,0.977241,gini,,,,1e-07,1,...,0.897024,0.977302,0.898431,0.977311,0.897009,0.977332,0.338296,0.012850,0.000697,0.000110
4,12.382997,0.218582,0.895721,0.974262,gini,,,,1e-07,1,...,0.894892,0.974295,0.896156,0.974367,0.895875,0.974254,0.709263,0.017063,0.000654,0.000066
5,11.710901,0.221193,0.894421,0.973039,gini,,,,1e-07,1,...,0.892786,0.973040,0.895034,0.973086,0.893801,0.973092,0.170381,0.012044,0.000978,0.000052
6,11.644824,0.210864,0.866024,0.944396,gini,,,,1e-07,2,...,0.865494,0.944518,0.866372,0.944507,0.866228,0.944670,0.305626,0.007916,0.000677,0.000222
7,12.035771,0.213270,0.837126,0.908752,gini,,,,1e-07,2,...,0.837739,0.909593,0.835161,0.908382,0.835610,0.907265,0.242986,0.009650,0.001623,0.000975
8,11.513276,0.210764,0.866024,0.944396,gini,,,,1e-07,2,...,0.865494,0.944518,0.866372,0.944507,0.866228,0.944670,0.194951,0.007242,0.000677,0.000222
9,12.040976,0.230718,0.837126,0.908752,gini,,,,1e-07,2,...,0.837739,0.909593,0.835161,0.908382,0.835610,0.907265,0.377567,0.010001,0.001623,0.000975


In [34]:
clf.best_index_

217

In [38]:
cv_results.iloc[217]

mean_fit_time                                                               3.71931
mean_score_time                                                            0.241844
mean_test_score                                                            0.907198
mean_train_score                                                           0.978539
param_criterion                                                                gini
param_max_depth                                                                None
param_max_features                                                             log2
param_max_leaf_nodes                                                           None
param_min_impurity_split                                                      1e-07
param_min_samples_leaf                                                            1
param_min_samples_split                                                           2
param_min_weight_fraction_leaf                                              

## RandomForestClassifier

In [27]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier


params_rfc = {
#     'class_weight': None,
    'n_estimators': [10, 100, 1000],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 1, 2, 3, 5, 10, 19],
    'max_features': [None, 44, 'sqrt', 'log2'],
    'min_impurity_split': [0.0000001],
    'min_samples_split': [2, 3, 4], 
    'min_samples_leaf':[1, 2, 3],
    'min_weight_fraction_leaf': [0],
    'max_leaf_nodes': [None, 100, 1000, 2000],
    'random_state': [0],
}

rfc = RandomForestClassifier()

clf = GridSearchCV(rfc, params_rfc, scoring='accuracy', n_jobs=8, cv=5, verbose=3)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))


Fitting 5 folds for each of 4032 candidates, totalling 20160 fits


[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:   31.0s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:  3.1min
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:  7.7min
[Parallel(n_jobs=8)]: Done 496 tasks      | elapsed: 14.4min
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed: 22.0min
[Parallel(n_jobs=8)]: Done 1136 tasks      | elapsed: 24.9min
[Parallel(n_jobs=8)]: Done 1552 tasks      | elapsed: 27.7min
[Parallel(n_jobs=8)]: Done 2032 tasks      | elapsed: 30.7min
[Parallel(n_jobs=8)]: Done 2576 tasks      | elapsed: 33.1min
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed: 36.8min
[Parallel(n_jobs=8)]: Done 3856 tasks      | elapsed: 41.3min
[Parallel(n_jobs=8)]: Done 4592 tasks      | elapsed: 45.9min
[Parallel(n_jobs=8)]: Done 5392 tasks      | elapsed: 52.1min
[Parallel(n_jobs=8)]: Done 6256 tasks      | elapsed: 60.6min
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed: 67.3min
[Parallel(n_jobs=8)]: Done 8176 tasks      | elapsed: 85.2min
[Parallel(n_j

0.9197423325683288


In [28]:
print(clf.score(final_test_features, final_test_outcomes))

0.9221636406881789


In [30]:
clf.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0, presort=False,
            random_state=0, splitter='random')

In [31]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0,
 'random_state': 0,
 'splitter': 'random'}

In [33]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_criterion,param_max_depth,param_max_features,param_max_leaf_nodes,param_min_impurity_split,param_min_samples_leaf,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,10.752858,0.214371,0.905006,0.978539,gini,,,,1e-07,1,...,0.904274,0.978629,0.904808,0.978585,0.905114,0.978634,0.495344,0.013856,0.000513,0.000106
1,11.725743,0.223996,0.905723,0.978539,gini,,,,1e-07,1,...,0.905194,0.978629,0.905923,0.978585,0.905766,0.978634,0.212399,0.018538,0.000709,0.000106
2,11.766043,0.227406,0.899003,0.977533,gini,,,,1e-07,1,...,0.898622,0.977589,0.898940,0.977607,0.898587,0.977568,0.314724,0.034649,0.000604,0.000074
3,12.103046,0.224197,0.897858,0.977241,gini,,,,1e-07,1,...,0.897024,0.977302,0.898431,0.977311,0.897009,0.977332,0.338296,0.012850,0.000697,0.000110
4,12.382997,0.218582,0.895721,0.974262,gini,,,,1e-07,1,...,0.894892,0.974295,0.896156,0.974367,0.895875,0.974254,0.709263,0.017063,0.000654,0.000066
5,11.710901,0.221193,0.894421,0.973039,gini,,,,1e-07,1,...,0.892786,0.973040,0.895034,0.973086,0.893801,0.973092,0.170381,0.012044,0.000978,0.000052
6,11.644824,0.210864,0.866024,0.944396,gini,,,,1e-07,2,...,0.865494,0.944518,0.866372,0.944507,0.866228,0.944670,0.305626,0.007916,0.000677,0.000222
7,12.035771,0.213270,0.837126,0.908752,gini,,,,1e-07,2,...,0.837739,0.909593,0.835161,0.908382,0.835610,0.907265,0.242986,0.009650,0.001623,0.000975
8,11.513276,0.210764,0.866024,0.944396,gini,,,,1e-07,2,...,0.865494,0.944518,0.866372,0.944507,0.866228,0.944670,0.194951,0.007242,0.000677,0.000222
9,12.040976,0.230718,0.837126,0.908752,gini,,,,1e-07,2,...,0.837739,0.909593,0.835161,0.908382,0.835610,0.907265,0.377567,0.010001,0.001623,0.000975


In [34]:
clf.best_index_

217

# Simple Linear Classifier

In [None]:
from sklearn import linear_model

clf_sgd = linear_model.SGDClassifier()
clf_sgd.fit(X_train, y_train)
clf_sgd.score(X_test, y_test)

# VotingClassifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

clf1 = RandomForestClassifier(n_jobs=8, verbose=3, random_state=0)
clf2 = GaussianNB()
clf3 = DecisionTreeClassifier(random_state=0)
clf4 = GradientBoostingClassifier(verbose=3, random_state=0)

eclf = VotingClassifier(estimators=[
        ('rf', clf1), ('gnb', clf2), ('dt', clf3), ('gb', clf4)],
        voting='soft')
eclf = eclf.fit(X_train, y_train)
print('eclf score: {}'.format(eclf.score(X_test, y_test)))

In [None]:
eclf.estimators_

# GradientBoostingClassifier

In [None]:
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train, y_train)
print('{}'.format(gbc.score(X_test, y_test)))

## GradientBoostingClassifier (Tuned)

In [None]:
gbc_tuned = GradientBoostingClassifier(
    learning_rate=0.0983,
    max_depth=6,
    max_features=len(list(final_features.columns.values)),
    subsample=0.9,
    verbose=3,
    random_state=0,
)
gbc_tuned.fit(X_train, y_train)
print('{}'.format(gbc_tuned.score(X_test, y_test)))

# DecisionTreeClassifier

In [18]:
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
print('{}'.format(dtc.score(X_test, y_test)))


0.9171865220112664


# RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)
print('{}'.format(rfc.score(X_test, y_test)))

# GaussianNB

In [None]:
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('{}'.format(gnb.score(X_test, y_test)))