In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline
sns.set_style('whitegrid')
sns.set_context('paper')

In [2]:
def select(query, db_path='./data/lending-club-loan-data/database2.sqlite'):
    """Run a SQL query against a SQLite database and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite file. Defaults to the database used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the query's result columns.
        A query that matches no rows yields an empty frame that still carries the
        column names.

    Raises
    ------
    sqlite3.Error
        Propagated unchanged if the query fails; the connection is closed first.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query).fetchall()
        # cursor.description is None for statements that produce no result set
        column_names = [column[0] for column in (cursor.description or ())]
    finally:
        # release the file handle even when execute() raises
        conn.close()

    # passing the names to the constructor (instead of assigning .columns afterwards)
    # also works when `rows` is empty
    return pd.DataFrame(rows, columns=column_names)

In [3]:
# Training split: the full feature table, plus the loan_status column of the target table.
features_train = select('SELECT * FROM FEATURES_TRAIN')
targets_train = select('SELECT * FROM TARGETS_TRAIN')['loan_status']

# Test split, loaded the same way.
features_test = select('SELECT * FROM FEATURES_TEST')
targets_test = select('SELECT * FROM TARGETS_TEST')['loan_status']

In [4]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

In [5]:
# undersampling ratio, SMOTE oversampling ratio, ADASYN oversampling ratio. all 1:1

# Row counts of each class in the training set. The targets are cast to bool, so the mask
# selects the rows whose label is truthy (class 1) and its negation the class-0 rows.
# Each count is computed once instead of re-filtering the full frame for every dict entry.
_is_class_1 = targets_train.astype(bool)
_n_class_0 = len(features_train[~_is_class_1])
_n_class_1 = len(features_train[_is_class_1])

# One 1:1 target per resampler, in the order the pipelines consume them:
#   [0] undersampling -> both classes at the class-0 row count
#   [1] SMOTE         -> both classes at the class-1 row count
#   [2] ADASYN        -> both classes at the class-1 row count
ratios = [{0: _n_class_0, 1: _n_class_0},
          {0: _n_class_1, 1: _n_class_1},
          {0: _n_class_1, 1: _n_class_1}]

In [6]:
# sklearn's Pipeline class requires that intermediary steps have fit and transform methods. our re-samplers
# do not have transforms, so we must define a custom pipeline for this specific application.
# the function will be fairly limited and specific to our needs for simplicity/effectiveness

# Pipeline0 below runs the pipeline without hyperparam optimization; Pipeline1 (defined further down) repeats it with hyperparam optimization

# 'model' argument should be a tuple w/ the first entry as the model's name as a string; the second entry a class instance 
# model names should be: LR, GNB, KNN, or RF for the function to properly work..

# 16 scores per call (precision and recall of both classes under 4 sampling setups). note that resampling is w/o a deterministic seed, so results may vary on the same call

def Pipeline0(features,targets,model,resample_ratios,metric):
    """Fit a model as given (no hyperparameter search) on raw and resampled data and score it.

    The estimator is fitted four times: on the training data as passed in, then after
    RandomUnderSampler, SMOTE and ADASYN resampling. Every fit is evaluated against the
    notebook-level ``features_test`` / ``targets_test``.

    Parameters
    ----------
    features, targets
        Training features and labels. Labels are expected to be 0 (negative) and
        1 (positive), matching the keys of the ``ratio`` dicts.
    model : sequence
        ``(name, estimator)``. ``name`` is stored under the ``'model'`` key; ``estimator``
        is re-fitted in place for every sampling setup.
    resample_ratios : sequence of dict
        Three ``ratio`` dicts, consumed in the order UnderSamp, SMOTE, ADASYN.
    metric
        Unused here; accepted so the signature matches ``Pipeline1``.

    Returns
    -------
    dict
        ``'model'`` plus 16 scores. ``N_pre`` / ``N_rec`` / ``P_pre`` / ``P_rec`` are the
        negative- and positive-class precision and recall without resampling; the same
        four keys prefixed with ``UnderSamp_``, ``SMOTE_`` and ``ADASYN_`` hold the scores
        after the corresponding resampling.
    """
    # (key prefix, resampler class), in the same order as `resample_ratios`
    resamplers = [('UnderSamp_', RandomUnderSampler), ('SMOTE_', SMOTE), ('ADASYN_', ADASYN)]
    output = {'model': model[0]}

    def record_scores(prefix, train_features, train_targets):
        # fit on the given training data, predict the test set, store the four scores
        predictions = model[1].fit(train_features, train_targets).predict(features_test)
        # labels=[0, 1] pins the matrix to 2x2 even if one class never shows up in the
        # test labels or predictions; otherwise the 4-way unpack below would fail
        tn, fp, fn, tp = confusion_matrix(targets_test, predictions, labels=[0, 1]).ravel()

        # the counts are numpy integers, so an empty denominator gives nan (plus a
        # RuntimeWarning) instead of raising ZeroDivisionError
        output[prefix + 'N_pre'] = tn/(tn+fn)
        output[prefix + 'N_rec'] = tn/(tn+fp)

        output[prefix + 'P_pre'] = tp/(tp+fp)
        output[prefix + 'P_rec'] = tp/(tp+fn)

    # baseline: no resampling
    record_scores('', features, targets)

    for position, (prefix, resampler) in enumerate(resamplers):
        # NOTE(review): `ratio=` / `fit_sample` are the names used by the imbalanced-learn
        # release this notebook was written against; newer releases appear to have renamed
        # them (`sampling_strategy` / `fit_resample`) - confirm before upgrading.
        sampler = resampler(ratio=resample_ratios[position])
        resampled_features, resampled_targets = sampler.fit_sample(features, targets)
        record_scores(prefix, resampled_features, resampled_targets)

    return output

In [7]:
# optimize hyperparameters for F1 score on negative class, since that is the area we're looking to improve

def neg_f1(targets_true,targets_predicted):
    """F1 score of the negative class (label 0) for binary 0/1 labels.

    Parameters
    ----------
    targets_true, targets_predicted : array-like
        True and predicted labels of equal length. Label 0 is the negative class; any
        other value is treated as positive.

    Returns
    -------
    float
        Harmonic mean of negative-class precision and recall. When there is no true
        negative the score is 0.0 (precision and/or recall are zero or undefined), so
        model selection never has to rank a nan.
    """
    actual_negative = np.asarray(targets_true) == 0
    predicted_negative = np.asarray(targets_predicted) == 0

    # counted directly so a fold containing a single class cannot break a 2x2 unpack
    tn = int(np.count_nonzero(actual_negative & predicted_negative))
    fn = int(np.count_nonzero(~actual_negative & predicted_negative))
    fp = int(np.count_nonzero(actual_negative & ~predicted_negative))

    if tn == 0:
        return 0.0

    precision = tn/(tn+fn)
    recall = tn/(tn+fp)
    return 2*(precision*recall)/(precision+recall)

def pos_f1(targets_true,targets_predicted):
    """F1 score of the positive class (label 1) for binary 0/1 labels.

    Parameters
    ----------
    targets_true, targets_predicted : array-like
        True and predicted labels of equal length. Label 1 is the positive class; any
        other value is treated as negative.

    Returns
    -------
    float
        Harmonic mean of positive-class precision and recall. When there is no true
        positive the score is 0.0 (precision and/or recall are zero or undefined), so
        model selection never has to rank a nan.
    """
    actual_positive = np.asarray(targets_true) == 1
    predicted_positive = np.asarray(targets_predicted) == 1

    # counted directly so a fold containing a single class cannot break a 2x2 unpack
    tp = int(np.count_nonzero(actual_positive & predicted_positive))
    fp = int(np.count_nonzero(~actual_positive & predicted_positive))
    fn = int(np.count_nonzero(actual_positive & ~predicted_positive))

    if tp == 0:
        return 0.0

    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    return 2*(precision*recall)/(precision+recall)

# lookup from the pipeline's `metric` argument ('neg' or 'pos') to the score function
# that the hyperparameter search optimizes
score_dict = {
    'neg': neg_f1,
    'pos': pos_f1,
}

In [8]:
# for RF, we will run grid search over max depth of the trees. we saw in the previous notebook that adjusting 
# min samples split and min samples leaf greatly reduces model performance

# we can actually just incorporate this function into pipeline0, using control flow..

def Pipeline1(features,targets,model,resample_ratios,metric):
    """Grid-search a model on raw and resampled data and score the best estimator.

    Same four sampling setups as ``Pipeline0`` (none, RandomUnderSampler, SMOTE, ADASYN),
    but each setup runs a ``GridSearchCV`` over a per-model parameter grid and evaluates
    ``best_estimator_`` on the notebook-level ``features_test`` / ``targets_test``.

    Parameters
    ----------
    features, targets
        Training features and labels. Labels are expected to be 0 (negative) and
        1 (positive), matching the keys of the ``ratio`` dicts.
    model : sequence
        ``(name, estimator)`` with ``name`` one of ``'LR'``, ``'GNB'``, ``'KNN'``, ``'RF'``;
        the name selects the parameter grid.
    resample_ratios : sequence of dict
        Three ``ratio`` dicts, consumed in the order UnderSamp, SMOTE, ADASYN.
    metric : str
        Key into ``score_dict`` (``'neg'`` or ``'pos'``) choosing the F1 variant the
        search optimizes.

    Returns
    -------
    dict
        ``'model'`` (the name prefixed with ``'re'``) plus the same 16 score keys that
        ``Pipeline0`` produces.

    Raises
    ------
    ValueError
        If the model name has no parameter grid.
    KeyError
        If ``metric`` is not a key of ``score_dict``.
    """
    param_grids = {
        'LR': {'C':[.001,.01,.1,1,10,100]},
        'GNB': {'priors':[[0.1,0.9],[0.2,0.8],[0.3,0.7],[0.4,0.6],[0.5,0.5],[0.6,0.4]]},
        'KNN': {'n_neighbors':list(range(3,8))},
        'RF': {'max_depth':[2,4,8,16,32,100]},
    }
    # fail up front with a clear message instead of an unbound `params` deep in the loop
    if model[0] not in param_grids:
        raise ValueError('unknown model name %r; expected one of %s'
                         % (model[0], sorted(param_grids)))
    params = param_grids[model[0]]

    # the scorer does not depend on the sampling setup, so build it once
    scorer = make_scorer(score_dict[metric])

    # (key prefix, resampler class), in the same order as `resample_ratios`
    resamplers = [('UnderSamp_', RandomUnderSampler), ('SMOTE_', SMOTE), ('ADASYN_', ADASYN)]
    output = {'model': 're'+model[0]}

    def record_scores(prefix, train_features, train_targets):
        # search the grid on the given training data, then score the winner on the test set.
        # train scores are never read, so skip computing them.
        search = GridSearchCV(model[1],param_grid=params,scoring=scorer,return_train_score=False)
        search.fit(train_features,train_targets)
        predictions = search.best_estimator_.predict(features_test)
        # labels=[0, 1] pins the matrix to 2x2 even if one class never shows up in the
        # test labels or predictions; otherwise the 4-way unpack below would fail
        tn,fp,fn,tp = confusion_matrix(targets_test,predictions,labels=[0, 1]).ravel()

        # the counts are numpy integers, so an empty denominator gives nan (plus a
        # RuntimeWarning) instead of raising ZeroDivisionError
        output[prefix + 'N_pre'] = tn/(tn+fn)
        output[prefix + 'N_rec'] = tn/(tn+fp)

        output[prefix + 'P_pre'] = tp/(tp+fp)
        output[prefix + 'P_rec'] = tp/(tp+fn)

    # baseline: no resampling
    record_scores('', features, targets)

    for position, (prefix, resampler) in enumerate(resamplers):
        # NOTE(review): `ratio=` / `fit_sample` are the names used by the imbalanced-learn
        # release this notebook was written against; newer releases appear to have renamed
        # them (`sampling_strategy` / `fit_resample`) - confirm before upgrading.
        sampler = resampler(ratio=resample_ratios[position])
        resampled_features, resampled_targets = sampler.fit_sample(features,targets)
        record_scores(prefix, resampled_features, resampled_targets)

    return output

In [9]:
# defines a pipeline process at a higher level of abstraction, which runs both the unregularized and regularized
# version of the algorithm, returning a 2x16 matrix (one row per version) with results across different sampling methods.

# the column w/ the metric name as the column name is the un-resampled version

def PipelineLayer1(model,metric,ratio_dicts):
    """Run one model through both pipelines and stack the two result rows.

    Parameters
    ----------
    model : sequence
        ``(name, estimator)``, forwarded to ``Pipeline0`` and ``Pipeline1``.
    metric : str
        Forwarded to both pipelines (``Pipeline1`` uses it to pick the score function).
    ratio_dicts : sequence of dict
        Resampling ratios, forwarded as ``resample_ratios``.

    Returns
    -------
    pandas.DataFrame
        Two rows indexed by ``'model'``: the plain run first, then the grid-searched run.
        Both runs are trained on the notebook-level ``features_train`` / ``targets_train``.
    """
    unregularized = Pipeline0(features=features_train,targets=targets_train,
                              model=model,resample_ratios=ratio_dicts,metric=metric)

    regularized = Pipeline1(features=features_train,targets=targets_train,
                            model=model,resample_ratios=ratio_dicts,metric=metric)

    # DataFrame.append no longer exists in pandas 2.x; building the frame from both
    # result dicts at once gives the same two rows on every pandas version
    return pd.DataFrame([unregularized, regularized]).set_index('model')

In [10]:
# algs_list should be a list of tuples (tuples formatted as the 'model' argument are entered into the lower level pipelines)
# e.g. ('LR',LogisticRegression())

# 'metric' is changed to pos/neg. we always want to study precision and recall together. for the actual analysis,
# we would run this function twice, to study the performance of the models on negative classes and positive classes
# we avoid returning all the metrics at once, because we're left with a table of 16 columns..too much information to
# easily digest from a human standpoint

def PipelineLayer2(model_list,metric,ratios):
    """Run ``PipelineLayer1`` for every model and stack the results into one table.

    Parameters
    ----------
    model_list : iterable
        ``(name, estimator)`` pairs, e.g. ``('LR', LogisticRegression())``.
    metric : str
        Forwarded to ``PipelineLayer1``.
    ratios : sequence of dict
        Resampling ratios, forwarded to ``PipelineLayer1``.

    Returns
    -------
    pandas.DataFrame
        The per-model frames concatenated row-wise, in ``model_list`` order. An empty
        ``model_list`` gives an empty frame.
    """
    per_model_frames = [PipelineLayer1(model,metric,ratios) for model in model_list]

    # pd.concat raises on an empty list, whereas this function has always returned an
    # empty frame in that case
    if not per_model_frames:
        return pd.DataFrame()

    # one concat at the end replaces the removed DataFrame.append and avoids re-copying
    # the accumulated frame on every iteration
    return pd.concat(per_model_frames)


In [11]:
# as the functions are currently defined, they only provide a shallow evaluation of which algorithms perform
# best on certain metrics. the best parameters are not output...


# !!
# perhaps we want to return two metrics at a time, so we can see how much we sacrifice in a metric's performance
# in exchange for an optimal value in another metric. as defined now, FinalPipeline returns results for just one metric

In [12]:
# (name, estimator) pairs in the format the pipeline functions expect; the names are the
# ones Pipeline1 uses to choose a parameter grid
algs = [
    ('LR', LogisticRegression()),
    ('GNB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier()),
]

In [13]:
# results = PipelineLayer2(algs,metric='neg',ratios=ratios)

In [14]:
# results.to_csv('./pipeline-results.csv')

In [15]:
# Load the cached pipeline results from disk; the cells that compute this table
# (PipelineLayer2 run) and write the CSV (to_csv) are commented out above.
results = pd.read_csv('./pipeline-results.csv')

In [16]:
# Guarded so the cell can be re-run: once 'model' has become the index it is no longer a
# column, and an unconditional set_index('model') would raise KeyError.
if 'model' in results.columns:
    results = results.set_index('model')

In [17]:
# Column groups of the results table: precision/recall of the negative class (`neg`) and of
# the positive class (`pos`), first without resampling, then per resampler.
neg = [prefix + score
       for prefix in ('', 'UnderSamp_', 'SMOTE_', 'ADASYN_')
       for score in ('N_pre', 'N_rec')]
pos = [prefix + score
       for prefix in ('', 'UnderSamp_', 'SMOTE_', 'ADASYN_')
       for score in ('P_pre', 'P_rec')]

In [18]:
# RF does not seem to perform well when using SMOTE/ADASYN

# negative-class precision/recall columns, all models
results.loc[:, neg]

Unnamed: 0_level_0,N_pre,N_rec,UnderSamp_N_pre,UnderSamp_N_rec,SMOTE_N_pre,SMOTE_N_rec,ADASYN_N_pre,ADASYN_N_rec
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR,0.5,0.000224,0.273209,0.640721,0.27346,0.630866,0.253973,0.726733
reLR,0.5,0.000224,0.273283,0.640497,0.274015,0.634786,0.254445,0.719677
GNB,0.325714,0.307985,0.266798,0.607011,0.252503,0.669392,0.217562,0.726173
reGNB,0.263459,0.608915,0.256094,0.6648,0.236426,0.750028,0.208146,0.788106
KNN,0.229395,0.070445,0.203121,0.543734,0.196971,0.413596,0.183698,0.373054
reKNN,0.215069,0.191175,0.194014,0.70635,0.195092,0.370366,0.183156,0.517303
RF,0.335203,0.1606,0.238553,0.688543,0.311747,0.223205,0.304969,0.210326
reRF,0.331065,0.152537,0.259314,0.664128,0.314198,0.20719,0.31183,0.169448


In [19]:
# positive-class precision/recall columns, all models
results.loc[:, pos]

Unnamed: 0_level_0,P_pre,P_rec,UnderSamp_P_pre,UnderSamp_P_rec,SMOTE_P_pre,SMOTE_P_rec,ADASYN_P_pre,ADASYN_P_rec
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR,0.823123,0.999952,0.891379,0.633674,0.889674,0.639764,0.902102,0.541196
reLR,0.823123,0.999952,0.89136,0.633939,0.89053,0.638537,0.900749,0.546781
GNB,0.852989,0.862968,0.88365,0.641473,0.889863,0.5741,0.881718,0.438705
reGNB,0.882964,0.634132,0.890346,0.584956,0.899223,0.479384,0.886475,0.355614
KNN,0.826112,0.949139,0.846681,0.541533,0.834957,0.637598,0.826907,0.643712
reKNN,0.830218,0.850042,0.854058,0.369334,0.832294,0.671585,0.829341,0.504152
RF,0.837757,0.931544,0.887418,0.527645,0.842653,0.894091,0.840893,0.896979
reRF,0.836777,0.933759,0.891364,0.592298,0.841228,0.902804,0.837447,0.919629


In [20]:
# we see below that for logistic regression, hyperparameter optimization does little to improve the model performance
# the majority of change in the model's performance comes from resampling

# PipelineLayer1(('LR',LogisticRegression()),'neg',ratios)

In [21]:
# for Gaussian NB, regularization has a more notable effect on the model's performance for both the following two metrics

# PipelineLayer1(('GNB',GaussianNB()),'neg',ratios)

In [22]:
# negative precision seems to be the metric most difficult to optimize for.. we can "artificially" improve neg recall
# by increasing the model's sensitivity to negative outputs (via resampling), but it's quite difficult for 
# the model to be precise in its prediction of negative classes

In [24]:
# number of feature columns in the training set
features_train.shape[1]

17