### Stacking Model Development

- Objective: develop and compare stacking models using best candidates from Nested CV notebook 
- Methodology: mlxtend module

We will use the augmented dataset (which does not include the duration field)

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

In [19]:
from mlxtend.classifier import StackingCVClassifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [7]:
#Loading df1 after it has been augmented in iteration 1:
df = pd.read_pickle('../data/pickle_files/df_pickle')
#Excluding the duration variable as it cannot be used in our baseline
df = df.drop(columns = ['duration'])

In [8]:
#Checking dtypes have loaded correctly (should return empty index)
df.select_dtypes(exclude = ['float64', 'int64']).columns

y = df['y']
X = df.drop(columns=['y'])

In [9]:
scaler = preprocessing.StandardScaler().fit(X)
X_transformed = scaler.transform(X)

In [10]:
#X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_transformed), y, random_state = 4)

In [11]:
#will work with numpy arrays
y = np.array(y)
X = np.array(X_transformed)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 4)

Most Promising Models from Iteration 1 - Model Comparison Notebook:
    - Decision Tree Classifier
    - K Neighbors Classifier
    - BernoulliNB
    - Logistic Regression

We will implement a stacking algorithm that uses all three and makes a prediction based on their predictions

In [98]:
def gridSearch_clf(clf, param_grid, X_train, y_train):
    gs = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    print("Best Parameters")
    print(gs.best_params_)
    return gs.best_estimator_

In [99]:
def gs_report(y_test, X_test, best_estimator):
    print(classification_report(y_test, best_estimator.predict(X_test)))
    print("Overall Accuracy Score: ")
    print(accuracy_score(y_test, best_estimator.predict(X_test)))

In [100]:
clf1 = DecisionTreeClassifier()
param_grid = {'max_depth':[3,5,7],
              'min_samples_split':[100, 1000],
              'max_features': [4,6,8],
              'max_leaf_nodes':[10,20],
              'class_weight': ['balanced']}

In [101]:
#Saving best estimator for clf1
best_clf1 = gridSearch_clf(clf1, param_grid, X_train, y_train)
gs_report(y_test,X_test, gridSearch_clf(clf1, param_grid, X_train, y_train))

Best Parameters
{'class_weight': 'balanced', 'max_depth': 7, 'max_features': 8, 'max_leaf_nodes': 20, 'min_samples_split': 1000}
Best Parameters
{'class_weight': 'balanced', 'max_depth': 7, 'max_features': 8, 'max_leaf_nodes': 20, 'min_samples_split': 100}
             precision    recall  f1-score   support

          0       0.95      0.85      0.90      9131
          1       0.35      0.62      0.45      1166

avg / total       0.88      0.83      0.85     10297

Overall Accuracy Score: 
0.8252889191026512


In [102]:
clf2 = KNeighborsClassifier()

In [103]:
param_grid = {'n_neighbors':[3,5,7,9],
              'algorithm' : ['auto', 'ball_tree'],
              'weights': ['distance']}

In [104]:
#Saving Best Estimator for clf2
best_clf2 = gridSearch_clf(clf2, param_grid, X_train, y_train)
gs_report(y_test,X_test, gridSearch_clf(clf2, param_grid, X_train, y_train))

Best Parameters
{'algorithm': 'auto', 'n_neighbors': 9, 'weights': 'distance'}
Best Parameters
{'algorithm': 'auto', 'n_neighbors': 9, 'weights': 'distance'}
             precision    recall  f1-score   support

          0       0.91      0.97      0.94      9131
          1       0.51      0.27      0.36      1166

avg / total       0.87      0.89      0.87     10297

Overall Accuracy Score: 
0.8876371758764688


In [105]:
clf3 = BernoulliNB()
param_grid = {'alpha':np.logspace(-2, 3, num=6, base=10)}

In [106]:
best_clf3 = gridSearch_clf(clf3, param_grid, X_train, y_train)
gs_report(y_test,X_test, gridSearch_clf(clf3, param_grid, X_train, y_train))

Best Parameters
{'alpha': 1000.0}
Best Parameters
{'alpha': 1000.0}
             precision    recall  f1-score   support

          0       0.92      0.90      0.91      9131
          1       0.31      0.35      0.33      1166

avg / total       0.85      0.84      0.84     10297

Overall Accuracy Score: 
0.835971642225891


In [107]:
lr = LogisticRegression()

In [108]:
np.random.seed(3)
sclf = StackingCVClassifier(classifiers=[best_clf1, best_clf2, best_clf3], 
                            meta_classifier=lr)

In [109]:
print('3-fold cross validation:\n')

for clf, label in zip([best_clf1, best_clf2, best_clf3, sclf], 
                      ['Decision Tree Classifier', 
                       'K Neighbors Classifier', 
                       'Bernoulli Naive Bayes',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

3-fold cross validation:

Accuracy: 0.37 (+/- 0.37) [Decision Tree Classifier]
Accuracy: 0.48 (+/- 0.29) [K Neighbors Classifier]
Accuracy: 0.67 (+/- 0.32) [Bernoulli Naive Bayes]
Accuracy: 0.48 (+/- 0.29) [StackingClassifier]


In [110]:
clfs = [best_clf1, best_clf2, best_clf3, sclf]
clf_names = [i.__class__.__name__ for i in clfs]

In [111]:
def print_cv(clfs, clf_names):
    
    print('3-fold cross validation:\n')

    for clf, label in zip([best_clf1, best_clf2, best_clf3, sclf], 
                          ['Decision Tree Classifier', 
                           'K Neighbors Classifier', 
                           'Bernoulli Naive Bayes',
                           'StackingClassifier']):

        scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')

        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [112]:
print_cv(clfs, clf_names)

3-fold cross validation:

Accuracy: 0.34 (+/- 0.39) [Decision Tree Classifier]
Accuracy: 0.48 (+/- 0.29) [K Neighbors Classifier]
Accuracy: 0.67 (+/- 0.32) [Bernoulli Naive Bayes]
Accuracy: 0.48 (+/- 0.29) [StackingClassifier]


In [113]:
#Trying the same stacking classifier with class probabilities rather than class labels
np.random.seed(3)
sclf_proba = StackingCVClassifier(classifiers=[best_clf1, best_clf2, best_clf3],
                            use_probas = True,
                            meta_classifier=lr)

In [114]:
clfs = [best_clf1, best_clf2, best_clf3, sclf_proba]
clf_names = [i.__class__.__name__ for i in clfs]

In [115]:
print_cv(clfs, clf_names)

3-fold cross validation:

Accuracy: 0.37 (+/- 0.37) [Decision Tree Classifier]
Accuracy: 0.48 (+/- 0.29) [K Neighbors Classifier]
Accuracy: 0.67 (+/- 0.32) [Bernoulli Naive Bayes]
Accuracy: 0.48 (+/- 0.29) [StackingClassifier]


In [None]:
# Initializing models

clf1 = DecisionTreeClassifier()
clf2 = KNeighborsClassifier()
clf3 = BernoulliNB()
lr = LogisticRegression()

# The StackingCVClassifier uses scikit-learn's check_cv
# internally, which doesn't support a random seed. Thus
# NumPy's random seed need to be specified explicitely for
# deterministic behavior
np.random.seed(RANDOM_SEED)
sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3], 
                            meta_classifier=lr)

params = {'decisiontreeclassifier__'
          'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 50],
          'meta-logisticregression__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=sclf, 
                    param_grid=params, 
                    cv=5,
                    refit=True)
grid.fit(X, y)

cv_keys = ('mean_test_score', 'std_test_score', 'params')

for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.2f' % grid.best_score_)