### Spotchecking Models

- Objective: identify promising models
- Method: draft different models and compare them with Nested CV 
- We are not allowed to use Duration

In [7]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

In [9]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, precision_score, recall_score, average_precision_score
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.utils.fixes import signature
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
pd.options.display.float_format = '{:.2f}'.format

In [11]:
#Loading df1 after it has been augmented in iteration 1:
df = pd.read_pickle('../data/pickle_files/df_pickle')
#Excluding the duration variable as it cannot be used in our baseline
df = df.drop(columns = ['duration'])

In [12]:
#Checking dtypes have loaded correctly (should return empty index)
df.select_dtypes(exclude = ['float64', 'int64']).columns

y = df['y']
X = df.drop(columns=['y'])

In [13]:
scaler = preprocessing.StandardScaler().fit(X)
X_transformed = scaler.transform(X)

In [14]:
#X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_transformed), y, random_state = 4)

In [15]:
#will work with numpy arrays
y = np.array(y)
X = np.array(X_transformed)

In [16]:
#Changed in version 0.20: Parameter iid will change from True to False by default in version 0.22, and will be removed in 0.24.

In [17]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer, recall_score, precision_score

In [18]:
def report_withaccuracy(y_true, y_pred, print_report = False):

    #print(classification_report(y_true, y_pred)) # print classification report

    return accuracy_score(y_true, y_pred)

In [19]:
def report_withrecall(y_true, y_pred, print_report = False):

    #print(classification_report(y_true, y_pred)) # print classification report

    return recall_score(y_true, y_pred) # return accuracy score

In [20]:
def report_withprecision(y_true, y_pred, print_report = False):

    #print(classification_report(y_true, y_pred)) # print classification report

    return precision_score(y_true, y_pred) # return accuracy score

In [21]:
def nested_CV(X, y, estimator, param_grid, num_cv = 10):
    
    gs = GridSearchCV(estimator = estimator,
                     param_grid = param_grid,
                     scoring = 'recall',
                     cv=2)
    
    scores_acc = cross_val_score(estimator, X, y, scoring=make_scorer(report_withaccuracy), cv = num_cv)
    scores_recall = cross_val_score(estimator, X, y, scoring=make_scorer(report_withrecall), cv = num_cv)
    #scores_prec = cross_val_score(gs_LR, X_transformed, y, scoring=make_scorer(report_withprecision), cv = 10)
    
    #print('ML Algorithm: %s' %(estimator.__name__))
    print("Average Performance Measures Across %i Folds" %(num_cv))
    print('CV Accuracy : %.3f +/- %.3f' %(np.mean(scores_acc), np.std(scores_acc)))
    print('CV Recall : %.3f +/- %.3f' %(np.mean(scores_recall), np.std(scores_recall)))
    #print('CV Precision : %.3f +/- %.3f' %(np.mean(scores_prec), np.std(scores_prec)))

In [22]:
param_grid = {'solver':['lbfgs', 'liblinear'],
              'max_iter': [10,20,30], 
              'class_weight': ['balanced']}

print("Logistic Regression")
nested_CV(X, y, LogisticRegression(), param_grid, 11)

Logistic Regression
Average Performance Measures Across 11 Folds
CV Accuracy : 0.835 +/- 0.178
CV Recall : 0.216 +/- 0.299


In [None]:
param_grid = {'C':np.logspace(-2, 3, num=6, base=10),
              'kernel': ['rbf'],
              'gamma' : [0.1, 1, 10],
              'class_weight': ['balanced']}

print("Support Vector Machine")
nested_CV(X, y, SVC(), param_grid, 5)

Support Vector Machine


In [None]:
param_grid = {'max_depth':[3,5,7],
              'min_samples_split':[100, 1000],
              'max_features': [4,6,8],
              'max_leaf_nodes':[10,20],
              'class_weight': ['balanced']}

print("Decision Tree Classifier")
nested_CV(X, y, DecisionTreeClassifier(), param_grid, 11)

In [None]:
param_grid = {'n_neighbors':[3,5,7,9],
              'algorithm' : ['auto', 'ball_tree'],
              'weights': ['distance']}

print("KNeighborsClassifier")
nested_CV(X, y, KNeighborsClassifier(), param_grid, 11)

In [None]:
param_grid = {'solver':['lsqr','eigen'],
              'n_components':[5,10,15],
              'shrinkage': [0,.5,.9]}

print("LinearDiscriminantAnalysis")
nested_CV(X, y, LinearDiscriminantAnalysis(), param_grid, 11)

In [None]:
param_grid = {'alpha':np.logspace(-2, 3, num=6, base=10)}

print("BernoulliNB")
nested_CV(X, y, BernoulliNB(), param_grid, 11)

In [None]:
param_grid = {'var_smoothing':[.2,.4,.6,.8]}

print("GaussianNB")
nested_CV(X, y, GaussianNB(), param_grid, 11)