### Spotchecking Models

- Objective: identify promising models
- Method: draft different models and compare them with Nested CV 
- We are not allowed to use Duration

In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
pd.options.display.float_format = '{:.2f}'.format

In [5]:
#Loading df1 after it has been augmented in iteration 2:
df = pd.read_pickle('../data/pickle_files/df_pickle_w_time_stats')
#Excluding the duration variable as it cannot be used in our baseline
df = df.drop(columns = ['duration'])

In [6]:
#Checking dtypes have loaded correctly (should return empty index)
df.select_dtypes(exclude = ['float64', 'int64']).columns

y = df['y']
X = df.drop(columns=['y'])

In [10]:
#int8 dtypes are categorical features - let's make sure we don't standardize them
df.dtypes

age                 int64
campaign            int64
pdays               int64
previous            int64
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
job                  int8
marital              int8
education            int8
default              int8
housing              int8
loan                 int8
contact              int8
month                int8
day_of_week          int8
poutcome             int8
y                    int8
dtype: object

In [15]:
X_temp_noncat = df.select_dtypes(exclude=['int8'])
X_temp_cat = df.select_dtypes(include=['int8'])

In [16]:
scaler = preprocessing.StandardScaler().fit(X_temp_noncat)
X_transformed = scaler.transform(X_temp_noncat)

In [20]:
X = pd.concat([pd.DataFrame(X_transformed, columns =X_temp_noncat.columns), X_temp_cat], axis = 1)

In [21]:
X = df.drop(columns=['y'])

In [22]:
#will work with numpy arrays
y = np.array(y)
X = np.array(X)

In [23]:
#Changed in version 0.20: Parameter iid will change from True to False by default in version 0.22, and will be removed in 0.24.

In [24]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, recall_score, precision_score

In [25]:
def report_withaccuracy(y_true, y_pred, print_report = False):

    #print(classification_report(y_true, y_pred)) # print classification report

    return accuracy_score(y_true, y_pred)

In [26]:
def report_withrecall(y_true, y_pred, print_report = False):

    #print(classification_report(y_true, y_pred)) # print classification report

    return recall_score(y_true, y_pred) # return accuracy score

In [27]:
def report_withprecision(y_true, y_pred, print_report = False):

    #print(classification_report(y_true, y_pred)) # print classification report

    return precision_score(y_true, y_pred) # return accuracy score

In [38]:
def nested_CV(X, y, estimator, param_grid, num_cv = 10):
    
    gs = GridSearchCV(estimator = estimator,
                     param_grid = param_grid,
                     scoring = 'recall',
                     cv=2)
    
    scores_acc = cross_val_score(estimator, X, y, scoring=make_scorer(report_withaccuracy), cv = num_cv)
    scores_recall = cross_val_score(estimator, X, y, scoring=make_scorer(report_withrecall), cv = num_cv)
    #scores_prec = cross_val_score(gs_LR, X_transformed, y, scoring=make_scorer(report_withprecision), cv = 10)
    
    #print('ML Algorithm: %s' %(estimator.__name__))
    print("Average Performance Measures Across %i Folds" %(num_cv))
    print('CV Accuracy : %.3f +/- %.3f' %(np.mean(scores_acc), np.std(scores_acc)))
    print('CV Recall : %.3f +/- %.3f' %(np.mean(scores_recall), np.std(scores_recall)))
    #print('CV Precision : %.3f +/- %.3f' %(np.mean(scores_prec), np.std(scores_prec)))
    
    return estimator.__class__.__name__, scores_acc

In [99]:
results = {}

In [90]:
def track_results(name,acc, results):
    results[name] = acc
    return results

In [100]:
param_grid = {'solver':['lbfgs', 'liblinear'],
              'max_iter': [10,20,30], 
              'class_weight': ['balanced']}

print("Logistic Regression")
names, accs = nested_CV(X, y, LogisticRegression(), param_grid, 11)
track_results(names,accs, results)

Logistic Regression
Average Performance Measures Across 11 Folds
CV Accuracy : 0.841 +/- 0.179
CV Recall : 0.212 +/- 0.269


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049])}

In [101]:
param_grid = {'C':np.logspace(-2, 3, num=6, base=10),
              'kernel': ['rbf'],
              'gamma' : [0.1, 1, 10],
              'class_weight': ['balanced']}

print("Support Vector Machine")
names, accs = nested_CV(X, y, SVC(), param_grid, 5)
track_results(names,accs, results)

Support Vector Machine
Average Performance Measures Across 5 Folds
CV Accuracy : 0.581 +/- 0.262
CV Recall : 0.032 +/- 0.062


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025])}

In [102]:
param_grid = {'max_depth':[3,5,7],
              'min_samples_split':[100, 1000],
              'max_features': [4,6,8],
              'max_leaf_nodes':[10,20],
              'class_weight': ['balanced']}

print("Decision Tree Classifier")
names, accs = nested_CV(X, y, DecisionTreeClassifier(), param_grid, 11)
track_results(names,accs, results)

Decision Tree Classifier
Average Performance Measures Across 11 Folds
CV Accuracy : 0.273 +/- 0.225
CV Recall : 0.112 +/- 0.282


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ])}

In [103]:
param_grid = {'n_neighbors':[3,5,7,9],
              'algorithm' : ['auto', 'ball_tree'],
              'weights': ['distance']}

print("KNeighborsClassifier")
names, accs = nested_CV(X, y, KNeighborsClassifier(), param_grid, 11)
track_results(names,accs, results)

KNeighborsClassifier
Average Performance Measures Across 11 Folds
CV Accuracy : 0.735 +/- 0.219
CV Recall : 0.156 +/- 0.279


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ]),
 'KNeighborsClassifier': array([0.88731642, 0.88384513, 0.88277704, 0.88331108, 0.86275033,
        0.8835781 , 0.85042735, 0.51842949, 0.40918803, 0.77264227,
        0.25353994])}

In [104]:
param_grid = {'solver':['lsqr','eigen'],
              'n_components':[5,10,15],
              'shrinkage': [0,.5,.9]}

print("LinearDiscriminantAnalysis")
names, accs = nested_CV(X, y, LinearDiscriminantAnalysis(), param_grid, 11)
track_results(names,accs, results)

LinearDiscriminantAnalysis
Average Performance Measures Across 11 Folds
CV Accuracy : 0.805 +/- 0.214
CV Recall : 0.292 +/- 0.340


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ]),
 'KNeighborsClassifier': array([0.88731642, 0.88384513, 0.88277704, 0.88331108, 0.86275033,
        0.8835781 , 0.85042735, 0.51842949, 0.40918803, 0.77264227,
        0.25353994]),
 'LinearDiscriminantAnalysis': array([0.88731642, 0.88731642, 0.88758344, 0.88811749, 0.9012016 ,
        0.89799733, 0.90945513, 0.91800214, 0.44524573, 0.96259685,
        0.27464601])}

In [105]:
param_grid = {'alpha':np.logspace(-2, 3, num=6, base=10)}

print("BernoulliNB")
names, accs = nested_CV(X, y, BernoulliNB(), param_grid, 11)
track_results(names,accs, results)

BernoulliNB
Average Performance Measures Across 11 Folds
CV Accuracy : 0.799 +/- 0.165
CV Recall : 0.264 +/- 0.240


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ]),
 'KNeighborsClassifier': array([0.88731642, 0.88384513, 0.88277704, 0.88331108, 0.86275033,
        0.8835781 , 0.85042735, 0.51842949, 0.40918803, 0.77264227,
        0.25353994]),
 'LinearDiscriminantAnalysis': array([0.88731642, 0.88731642, 0.88758344, 0.88811749, 0.9012016 ,
        0.89799733, 0.90945513, 0.91800214, 0.44524573, 0.96259685,
        0.27464601]),
 'BernoulliNB': array([0.88731642, 0.88731642, 0.88998665, 0.89826435, 0.91081442,
        0.90013351, 0.89903846, 0.78899573, 0.34375   , 0.72268234,
        0.65695966])}

In [106]:
param_grid = {'var_smoothing':[.2,.4,.6,.8]}

print("GaussianNB")
names, accs = name, acc = nested_CV(X, y, GaussianNB(), param_grid, 11)
track_results(names,accs, results)

GaussianNB
Average Performance Measures Across 11 Folds
CV Accuracy : 0.748 +/- 0.278
CV Recall : 0.489 +/- 0.421


{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ]),
 'KNeighborsClassifier': array([0.88731642, 0.88384513, 0.88277704, 0.88331108, 0.86275033,
        0.8835781 , 0.85042735, 0.51842949, 0.40918803, 0.77264227,
        0.25353994]),
 'LinearDiscriminantAnalysis': array([0.88731642, 0.88731642, 0.88758344, 0.88811749, 0.9012016 ,
        0.89799733, 0.90945513, 0.91800214, 0.44524573, 0.96259685,
        0.27464601]),
 'BernoulliNB': array([0.88731642, 0.88731642, 0.88998665, 0.89826435, 0.91081442,
        0.90013351, 0.89903846, 0.78899573, 0.34375   , 0.72268234,
        0.65695966]),
 'GaussianNB': a

In [107]:
results

{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ]),
 'KNeighborsClassifier': array([0.88731642, 0.88384513, 0.88277704, 0.88331108, 0.86275033,
        0.8835781 , 0.85042735, 0.51842949, 0.40918803, 0.77264227,
        0.25353994]),
 'LinearDiscriminantAnalysis': array([0.88731642, 0.88731642, 0.88758344, 0.88811749, 0.9012016 ,
        0.89799733, 0.90945513, 0.91800214, 0.44524573, 0.96259685,
        0.27464601]),
 'BernoulliNB': array([0.88731642, 0.88731642, 0.88998665, 0.89826435, 0.91081442,
        0.90013351, 0.89903846, 0.78899573, 0.34375   , 0.72268234,
        0.65695966]),
 'GaussianNB': a

In [109]:
results

{'LogisticRegression': array([0.88731642, 0.88731642, 0.88731642, 0.88731642, 0.88731642,
        0.89105474, 0.90758547, 0.92307692, 0.87847222, 0.93614748,
        0.27705049]),
 'SVC': array([0.8873513 , 0.54005827, 0.86501578, 0.2198616 , 0.39189025]),
 'DecisionTreeClassifier': array([0.88731642, 0.40694259, 0.13805073, 0.24432577, 0.41602136,
        0.27743658, 0.21928419, 0.06837607, 0.08279915, 0.09778253,
        0.1651082 ]),
 'KNeighborsClassifier': array([0.88731642, 0.88384513, 0.88277704, 0.88331108, 0.86275033,
        0.8835781 , 0.85042735, 0.51842949, 0.40918803, 0.77264227,
        0.25353994]),
 'LinearDiscriminantAnalysis': array([0.88731642, 0.88731642, 0.88758344, 0.88811749, 0.9012016 ,
        0.89799733, 0.90945513, 0.91800214, 0.44524573, 0.96259685,
        0.27464601]),
 'BernoulliNB': array([0.88731642, 0.88731642, 0.88998665, 0.89826435, 0.91081442,
        0.90013351, 0.89903846, 0.78899573, 0.34375   , 0.72268234,
        0.65695966]),
 'GaussianNB': a

In [113]:
for i in results.items():
    print(len(i[1]))

11
5
11
11
11
11
11


In [108]:
pd.DataFrame.from_dict(results)

ValueError: arrays must all be same length

In [None]:
#model_df['Mean Accuracy'] = [np.mean(i) for i in model_df['Accuracy']]
#model_df['Stdev Accuracy'] = [np.std(i) for i in model_df['Accuracy']]

In [62]:
sns.boxplot(x="CLF", y="Accuracy", data=pd.DataFrame.from_dict(results)))

ValueError: Neither the `x` nor `y` variable appears to be numeric.