In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline
sns.set_style('whitegrid')
sns.set_context('paper')

In [26]:
def select(query, db_path='./data/lending-club-loan-data/database2.sqlite'):
    """Run ``query`` against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite file to open. Defaults to the database file this
        notebook reads from.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the result-set
        columns. A query that matches no rows yields an empty frame that still
        carries those column names.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query).fetchall()
        # description is None when the statement produces no result set
        columns = [column[0] for column in (cursor.description or ())]
    finally:
        # release the connection even if the query itself raises
        conn.close()

    # passing the names to the constructor (instead of assigning them afterwards)
    # keeps an empty result from failing with a column-length mismatch
    return pd.DataFrame(rows, columns=columns)

In [27]:
# training split: feature table plus its loan_status label column
features_train = select('SELECT * FROM FEATURES_TRAIN')
targets_train = select('SELECT * FROM TARGETS_TRAIN')['loan_status']

# test split, loaded the same way
features_test = select('SELECT * FROM FEATURES_TEST')
targets_test = select('SELECT * FROM TARGETS_TEST')['loan_status']

In [28]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [29]:
# undersampling ratio, SMOTE oversampling ratio, ADASYN oversampling ratio. all 1:1

# boolean view of the labels: True marks class 1, False marks class 0
is_class_one = targets_train.astype(bool)
n_class_zero = len(features_train[~is_class_one])
n_class_one = len(features_train[is_class_one])

# one target-count dict per re-sampler, in the order: undersampling, SMOTE, ADASYN.
# undersampling sizes both classes to the class-0 count, oversampling to the class-1 count
ratios = [
    {0: n_class_zero, 1: n_class_zero},
    {0: n_class_one, 1: n_class_one},
    {0: n_class_one, 1: n_class_one},
]

In [37]:
# sklearn's Pipeline class requires that intermediary steps have fit and transform methods. our re-samplers
# do not have transforms, so we must define a custom pipeline for this specific application.
# the function will be fairly limited and specific to our needs for simplicity/effectiveness

# the overall process runs through the pipeline twice: without hyperparam optimization (Pipeline) and with hyperparam optimization (Pipeline1)

# 'model' argument should be a tuple w/ the first entry as the model's name as a string; the second entry a class instance 
# model names should be: LR, GNB, KNN, or RF for the function to properly work..

# 8 results for each algorithm. note that resampling is w/o a deterministic seed, so results may vary on the same call

def Pipeline(features,targets,model,resample_ratios,metric,
             test_features=None,test_targets=None,random_state=None):
    """Fit ``model`` on the raw and on each re-sampled training set, scoring every fit on test data.

    Parameters
    ----------
    features, targets
        Training features and labels.
    model : tuple
        ``(name, estimator)``; ``name`` is stored under the ``'model'`` key of the
        result and ``estimator`` must expose ``fit`` / ``predict``.
    resample_ratios : sequence of dict
        Target class counts handed to RandomUnderSampler, SMOTE and ADASYN,
        in that order.
    metric : str
        One of ``'neg_prec'``, ``'neg_recall'``, ``'pos_prec'``, ``'pos_recall'``.
    test_features, test_targets : optional
        Data the fitted models are scored on. When omitted, the notebook-level
        ``features_test`` / ``targets_test`` are used.
    random_state : optional
        Seed forwarded to every re-sampler. ``None`` (the default) leaves the
        re-sampling unseeded, so repeated calls may give different results.

    Returns
    -------
    dict
        ``'model'`` plus one score per training variant: the un-resampled fit is
        keyed by ``metric`` itself, the others by ``'UnderSamp'``, ``'SMOTE'``
        and ``'ADASYN'``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    # each metric as a function of the raveled confusion matrix (tn, fp, fn, tp)
    metric_fns = {
        'neg_prec':   lambda tn,fp,fn,tp: tn/(tn+fn),
        'neg_recall': lambda tn,fp,fn,tp: tn/(tn+fp),
        'pos_prec':   lambda tn,fp,fn,tp: tp/(tp+fp),
        'pos_recall': lambda tn,fp,fn,tp: tp/(tp+fn),
    }
    # fail before any fitting instead of silently returning a result with no scores
    if metric not in metric_fns:
        raise ValueError('metric must be one of %s, got %r' % (sorted(metric_fns), metric))

    if test_features is None:
        test_features = features_test
    if test_targets is None:
        test_targets = targets_test

    def evaluate(train_features, train_targets):
        # re-fit the shared estimator, then score its predictions on the test data
        predictions = model[1].fit(train_features, train_targets).predict(test_features)
        return metric_fns[metric](*confusion_matrix(test_targets, predictions).ravel())

    output = {'model': model[0]}

    # baseline without re-sampling is stored under the metric's own name
    output[metric] = evaluate(features.copy(), targets.copy())

    resamplers = [('UnderSamp',RandomUnderSampler),('SMOTE',SMOTE),('ADASYN',ADASYN)]
    for i, (name, resampler) in enumerate(resamplers):
        # NOTE(review): `ratio=` / `fit_sample` are the legacy imbalanced-learn spellings
        # (newer releases use `sampling_strategy=` / `fit_resample`) - confirm against
        # the installed version
        sampler = resampler(ratio=resample_ratios[i], random_state=random_state)
        output[name] = evaluate(*sampler.fit_sample(features, targets))

    return output

In [31]:
# optimize hyperparameters for F1 score on negative class, since that is the area we're looking to improve

def neg_f1(targets_true,targets_predicted):
    """F1 score of the negative class (label 0).

    Computed as 2*TN / (2*TN + FP + FN), which is the harmonic mean of the
    negative-class precision TN/(TN+FN) and recall TN/(TN+FP). Returns 0.0 when
    that denominator is zero, so the value never becomes NaN when used as a
    model-selection score.
    """
    # labels pinned to the notebook's 0/1 encoding so the matrix is always 2x2,
    # even when one class is missing from both inputs
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted,labels=[0,1]).ravel()
    denominator = 2*tn + fp + fn
    return 2*tn/denominator if denominator else 0.0

def neg_precision(targets_true,targets_predicted):
    """Precision of the negative class (label 0): TN / (TN + FN).

    Returns 0.0 when nothing was predicted negative, so the value never becomes
    NaN when used as a model-selection score.
    """
    # labels pinned to the notebook's 0/1 encoding so the matrix is always 2x2,
    # even when one class is missing from both inputs
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted,labels=[0,1]).ravel()
    predicted_negative = tn + fn
    return tn/predicted_negative if predicted_negative else 0.0

def neg_recall(targets_true,targets_predicted):
    """Recall of the negative class (label 0): TN / (TN + FP).

    Returns 0.0 when the true labels contain no negatives, so the value never
    becomes NaN when used as a model-selection score.
    """
    # labels pinned to the notebook's 0/1 encoding so the matrix is always 2x2,
    # even when one class is missing from both inputs
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted,labels=[0,1]).ravel()
    actual_negative = tn + fp
    return tn/actual_negative if actual_negative else 0.0

def pos_precision(targets_true,targets_predicted):
    """Precision of the positive class (label 1): TP / (TP + FP).

    Returns 0.0 when nothing was predicted positive, so the value never becomes
    NaN when used as a model-selection score.
    """
    # labels pinned to the notebook's 0/1 encoding so the matrix is always 2x2,
    # even when one class is missing from both inputs
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted,labels=[0,1]).ravel()
    predicted_positive = tp + fp
    return tp/predicted_positive if predicted_positive else 0.0

def pos_recall(targets_true,targets_predicted):
    """Recall of the positive class (label 1): TP / (TP + FN).

    Returns 0.0 when the true labels contain no positives, so the value never
    becomes NaN when used as a model-selection score.
    """
    # labels pinned to the notebook's 0/1 encoding so the matrix is always 2x2,
    # even when one class is missing from both inputs
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted,labels=[0,1]).ravel()
    actual_positive = tp + fn
    return tp/actual_positive if actual_positive else 0.0

# lookup from metric name to score function: the regularized runs of the pipeline
# optimize whichever function the requested metric name maps to

score_dict = {
    'neg_prec': neg_precision,
    'neg_recall': neg_recall,
    'pos_prec': pos_precision,
    'pos_recall': pos_recall,
}

In [39]:
# we will leave out RF. we saw in the analysis of regularization that changing the params from the defaults
# greatly reduces model performance across all metrics

def Pipeline1(features,targets,model,resample_ratios,metric,
              test_features=None,test_targets=None,random_state=None):
    """Grid-search ``model`` on the raw and on each re-sampled training set, scoring the best fit on test data.

    Parameters
    ----------
    features, targets
        Training features and labels.
    model : tuple
        ``(name, estimator)`` where ``name`` is ``'LR'``, ``'GNB'``, ``'KNN'`` or
        ``'RF'``; it selects the hyperparameter grid and is stored, prefixed with
        ``'re'``, under the ``'model'`` key of the result.
    resample_ratios : sequence of dict
        Target class counts handed to RandomUnderSampler, SMOTE and ADASYN,
        in that order.
    metric : str
        Key of ``score_dict``; the mapped function is both the grid-search
        objective and the score reported on the test data.
    test_features, test_targets : optional
        Data the selected models are scored on. When omitted, the notebook-level
        ``features_test`` / ``targets_test`` are used.
    random_state : optional
        Seed forwarded to every re-sampler. ``None`` (the default) leaves the
        re-sampling unseeded.

    Returns
    -------
    dict
        ``'model'`` plus one score per training variant: the un-resampled fit is
        keyed by ``metric`` itself, the others by ``'UnderSamp'``, ``'SMOTE'``
        and ``'ADASYN'``.

    Raises
    ------
    ValueError
        If the model name has no hyperparameter grid.
    KeyError
        If ``metric`` is not a key of ``score_dict``.
    """
    param_grids = {
        'LR': {'C':[.001,.01,.1,1,10,100]},
        'GNB': {'priors':[[0.1,0.9],[0.2,0.8],[0.3,0.7],[0.4,0.6],[0.5,0.5],[0.6,0.4]]},
        'KNN': {'n_neighbors':list(range(3,8))},
        'RF': {'min_samples_split':[2,8,32],'min_samples_leaf':[1,16,32]},
    }
    # an unrecognised name used to surface later as an unbound `params` variable
    if model[0] not in param_grids:
        raise ValueError('model name must be one of %s, got %r' % (sorted(param_grids), model[0]))
    params = param_grids[model[0]]

    # resolve the metric once, before any fitting
    score_fn = score_dict[metric]

    if test_features is None:
        test_features = features_test
    if test_targets is None:
        test_targets = targets_test

    def evaluate(train_features, train_targets):
        # NOTE: the search cross-validates on whatever training data it is given, so for
        # the re-sampled variants its validation folds are re-sampled too; only the
        # score returned here is computed on untouched test data.
        # train scores are not requested: nothing reads them and they cost extra scoring
        search = GridSearchCV(model[1],param_grid=params,scoring=make_scorer(score_fn))
        search.fit(train_features, train_targets)
        return score_fn(test_targets, search.best_estimator_.predict(test_features))

    output = {'model': 're'+model[0]}

    # baseline without re-sampling is stored under the metric's own name
    output[metric] = evaluate(features.copy(), targets.copy())

    resamplers = [('UnderSamp',RandomUnderSampler),('SMOTE',SMOTE),('ADASYN',ADASYN)]
    for i, (name, resampler) in enumerate(resamplers):
        # NOTE(review): `ratio=` / `fit_sample` are the legacy imbalanced-learn spellings
        # (newer releases use `sampling_strategy=` / `fit_resample`) - confirm against
        # the installed version
        sampler = resampler(ratio=resample_ratios[i], random_state=random_state)
        output[name] = evaluate(*sampler.fit_sample(features, targets))

    return output

In [44]:
# defines a pipeline process at a higher level of abstraction, which runs both the unregularized and regularized 
# version of the algorithm, returning the final desired 2x4 table (8 results) across different sampling methods.

# the column w/ the metric name as the column name is the un-resampled version

def FinalPipeline(model,metric,ratio_dicts):
    """Run the plain and the grid-searched pipeline for ``model`` and stack both results.

    Parameters
    ----------
    model : tuple
        ``(name, estimator)``, passed through to ``Pipeline`` and ``Pipeline1``.
    metric : str
        Metric name, passed through to both pipelines.
    ratio_dicts : sequence of dict
        Re-sampling ratios, passed through as ``resample_ratios``.

    Returns
    -------
    pandas.DataFrame
        Two rows indexed by the ``'model'`` value each pipeline reports
        (plain run first), one column per remaining result key.

    Both runs train on the notebook-level ``features_train`` / ``targets_train``.
    """
    unregularized = Pipeline(features=features_train,targets=targets_train,
                             model=model,resample_ratios=ratio_dicts,metric=metric)

    regularized = Pipeline1(features=features_train,targets=targets_train,
                            model=model,resample_ratios=ratio_dicts,metric=metric)

    # build the frame from both result dicts in one go; DataFrame.append, which
    # used to stack them, no longer exists in pandas 2.0+
    return pd.DataFrame([unregularized, regularized]).set_index('model')

In [None]:
# as the functions are currently defined, they only provide a shallow evaluation of which algorithms perform
# best on certain metrics. the best parameters are not output...


# !!
# perhaps we want to return two metrics at a time, so we can see how much we sacrifice in a metric's performance
# in exchange for an optimal value in another metric. as defined now, FinalPipeline returns results for just one metric

In [47]:
# we see below that for logistic regression, hyperparameter optimization does little to improve the model performance
# the majority of change in the model's performance comes from resampling

FinalPipeline(('LR',LogisticRegression()),'neg_prec',ratios)

Unnamed: 0_level_0,ADASYN,SMOTE,UnderSamp,neg_prec
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LR,0.2538,0.271674,0.273748,0.5
reLR,0.256313,0.272523,0.270718,0.466667


In [50]:
# we see that regularization does not guarantee performance boosts on unseen data. particularly for logistic regression,
# whose performance barely improves with regularization in this context, we see slightly worse performance after regularization
# on 2/3 re-sampling methods

FinalPipeline(('LR',LogisticRegression()),'neg_recall',ratios)

Unnamed: 0_level_0,ADASYN,SMOTE,UnderSamp,neg_recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LR,0.726845,0.642177,0.640497,0.000224
reLR,0.724829,0.635793,0.643745,0.000224


In [48]:
# for Gaussian NB, regularization has a more notable effect on the model's performance for both the following two metrics

FinalPipeline(('GNB',GaussianNB()),'neg_prec',ratios)

Unnamed: 0_level_0,ADASYN,SMOTE,UnderSamp,neg_prec
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GNB,0.217168,0.254352,0.264051,0.325714
reGNB,0.35658,0.345704,0.340388,0.354498


In [49]:
FinalPipeline(('GNB',GaussianNB()),'neg_recall',ratios)

Unnamed: 0_level_0,ADASYN,SMOTE,UnderSamp,neg_recall
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GNB,0.730653,0.659312,0.645985,0.307985
reGNB,0.786874,0.7535,0.748236,0.722589


In [23]:
# negative precision seems to be the metric most difficult to optimize for.. we can "artificially" improve neg recall
# by increasing the model's sensitivity to negative outputs (via resampling), but it's quite difficult for 
# the model to be precise in its prediction of negative classes