### Stacking Model Development

- Objective: develop and compare stacking models using best candidates from Nested CV notebook 
- Methodology: mlxtend module

We will use the augmented dataset (which does not include the duration field)

In [17]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math

In [18]:
from mlxtend.classifier import StackingCVClassifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

In [19]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

RANDOM_SEED = 12

In [20]:
#Loading df1 after it has been augmented in iteration 1:
df = pd.read_pickle('../data/pickle_files/df_pickle_w_time_stats')
#Excluding the duration variable as it cannot be used in our baseline
df = df.drop(columns = ['duration'])

In [21]:
#Checking dtypes have loaded correctly (should return empty index)
df.select_dtypes(exclude = ['float64', 'int64']).columns

y = df['y']
X = df.drop(columns=['y', 'Date'])

In [22]:
scaler = preprocessing.StandardScaler().fit(X)
X_transformed = scaler.transform(X)

In [23]:
#X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_transformed), y, random_state = 4)

In [24]:
#will work with numpy arrays
y = np.array(y)
X = np.array(X_transformed)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 4)

Most Promising Models from Iteration 1 - Model Comparison Notebook:
    - Decision Tree Classifier
    - K Neighbors Classifier
    - BernoulliNB
    - Logistic Regression

We will implement a stacking algorithm that uses all three and makes a prediction based on their predictions

In [26]:
def gridSearch_clf(clf, param_grid, X_train, y_train):
    gs = GridSearchCV(clf, param_grid).fit(X_train, y_train)
    print("Best Parameters")
    print(gs.best_params_)
    return gs.best_estimator_

In [27]:
def gs_report(y_test, X_test, best_estimator):
    print(classification_report(y_test, best_estimator.predict(X_test)))
    print("Overall Accuracy Score: ")
    print(accuracy_score(y_test, best_estimator.predict(X_test)))

In [28]:
clf1 = DecisionTreeClassifier()
param_grid = {'max_depth':[5,7,9,11],
              'min_samples_split':[900, 1000, 3000],
              'max_features': [7,8,10,15],
              'max_leaf_nodes':[10,20],
              'class_weight': ['balanced']}

In [29]:
#Saving best estimator
best_clf1 = gridSearch_clf(clf1, param_grid, X_train, y_train)
gs_report(y_test,X_test, best_clf1)

Best Parameters
{'class_weight': 'balanced', 'max_depth': 7, 'max_features': 10, 'max_leaf_nodes': 10, 'min_samples_split': 1000}
             precision    recall  f1-score   support

          0       0.94      0.86      0.90      9131
          1       0.36      0.61      0.45      1166

avg / total       0.88      0.83      0.85     10297

Overall Accuracy Score: 
0.8325725939594056


In [30]:
clf2 = KNeighborsClassifier()

In [31]:
param_grid = {'n_neighbors':[5,7,9,15],
              'algorithm' : ['auto', 'ball_tree'],
              'weights': ['distance']}

In [32]:
#Saving Best Estimator
best_clf2 = gridSearch_clf(clf2, param_grid, X_train, y_train)
gs_report(y_test,X_test, best_clf2)

Best Parameters
{'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
             precision    recall  f1-score   support

          0       0.91      0.97      0.94      9131
          1       0.54      0.29      0.37      1166

avg / total       0.87      0.89      0.88     10297

Overall Accuracy Score: 
0.8912304554724677


In [33]:
clf3 = BernoulliNB()
param_grid = {'alpha':np.logspace(-2, 3, num=6, base=10)}

In [34]:
#Saving best estimator
best_clf3 = gridSearch_clf(clf3, param_grid, X_train, y_train)
gs_report(y_test,X_test, best_clf3)

Best Parameters
{'alpha': 1000.0}
             precision    recall  f1-score   support

          0       0.95      0.73      0.83      9131
          1       0.25      0.69      0.36      1166

avg / total       0.87      0.73      0.77     10297

Overall Accuracy Score: 
0.7268136350393318


In [35]:
def print_cv(clfs, clf_names):
    
    print('3-fold cross validation:\n')

    for clf, label in zip([best_clf1, best_clf2, best_clf3, sclf], 
                          ['Logistic Regression', 
                           'K Neighbors Classifier', 
                           'Bernoulli Naive Bayes',
                           'StackingClassifier']):

        scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')

        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

In [36]:
lr = LogisticRegression()

In [37]:
np.random.seed(3)
sclf = StackingCVClassifier(classifiers=[best_clf1, best_clf2, best_clf3], 
                            meta_classifier=lr)

In [38]:
clfs = [best_clf1, best_clf2, best_clf3, sclf]
clf_names = [i.__class__.__name__ for i in clfs]

In [39]:
print_cv(clfs, clf_names)

3-fold cross validation:

Accuracy: 0.34 (+/- 0.40) [Logistic Regression]
Accuracy: 0.45 (+/- 0.31) [K Neighbors Classifier]
Accuracy: 0.52 (+/- 0.32) [Bernoulli Naive Bayes]
Accuracy: 0.40 (+/- 0.34) [StackingClassifier]


In [40]:
param_grid = {'meta-logisticregression__C':np.logspace(-2, 3, num=6, base=10),
             'meta-logisticregression__class_weight':['balanced']}

In [41]:
#Saving Best Estimator
best_sclf = gridSearch_clf(sclf, param_grid, X_train, y_train)
gs_report(y_test,X_test, best_sclf)

Best Parameters
{'meta-logisticregression__C': 1.0, 'meta-logisticregression__class_weight': 'balanced'}
             precision    recall  f1-score   support

          0       0.95      0.82      0.88      9131
          1       0.31      0.65      0.42      1166

avg / total       0.88      0.80      0.83     10297

Overall Accuracy Score: 
0.7986792269593085


In [42]:
#Trying the same stacking classifier with class probabilities rather than class labels
np.random.seed(3)
sclf_proba = StackingCVClassifier(classifiers=[best_clf1, best_clf2, best_clf3],
                            use_probas = True,
                            meta_classifier=lr)

In [43]:
clfs = [best_clf1, best_clf2, best_clf3, sclf_proba]
clf_names = [i.__class__.__name__ for i in clfs]

In [44]:
print_cv(clfs, clf_names)

3-fold cross validation:

Accuracy: 0.34 (+/- 0.40) [Logistic Regression]
Accuracy: 0.45 (+/- 0.31) [K Neighbors Classifier]
Accuracy: 0.52 (+/- 0.32) [Bernoulli Naive Bayes]
Accuracy: 0.40 (+/- 0.34) [StackingClassifier]
