In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline
sns.set_style('whitegrid')

In [2]:
def select(query, db_path='./data/lending-club-loan-data/database2.sqlite'):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute (typically a SELECT).
    db_path : str, optional
        Path of the SQLite file to open. Defaults to the lending-club database
        used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the result-set columns.
        A query that matches no rows yields an empty frame that still carries
        the column names.

    Raises
    ------
    sqlite3.Error
        Propagated from sqlite3 if the query cannot be executed.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(query)
        # description is None for statements that produce no result set
        column_names = [col[0] for col in (cursor.description or ())]
        rows = cursor.fetchall()
    finally:
        # always release the connection, even when the query fails
        conn.close()

    # passing the names to the constructor keeps empty results well-formed
    return pd.DataFrame(rows, columns=column_names)

In [3]:
# load the train/test feature and target tables, one DataFrame per table
features_train, targets_train, features_test, targets_test = [
    select(table_query)
    for table_query in (
        'SELECT * FROM FEATURES_TRAIN',
        'SELECT * FROM TARGETS_TRAIN',
        'SELECT * FROM FEATURES_TEST',
        'SELECT * FROM TARGETS_TEST',
    )
]

In [4]:
# class balance of the training targets (assuming 0/1 labels): rows labelled 0 per row
# labelled 1 -- approx 1:5 ratio of minority to majority class examples

train_labels = targets_train.loan_status
n_majority = train_labels.sum()                  # sum of the labels == number of 1s
n_minority = train_labels.count() - n_majority   # remaining rows are the 0s
ratio = n_minority / n_majority
ratio

0.2153942473241268

### Training Set Manipulation Approach 1: Undersampling of the majority class

In [5]:
# Random undersampling of the majority class (label 1), sized from the minority class (label 0):
# ratio_dict1: undersample the majority class to create a 1:1 ratio of minority to majority class examples
# ratio_dict2: undersample the majority class to create a 2:1 ratio of minority to majority class examples
#
# NOTE: `ratio=` and `fit_sample` are the names used by the imbalanced-learn release this notebook
# was run with; newer releases call them `sampling_strategy=` and `fit_resample` -- adjust if upgrading

from imblearn.under_sampling import RandomUnderSampler

# count the minority (label 0) rows once from the labels, rather than boolean-filtering
# the whole feature frame separately for every dictionary entry
n_minority = int((~targets_train.loan_status.astype(bool)).sum())

ratio_dict1 = {0: n_minority, 1: n_minority}
ratio_dict2 = {0: n_minority, 1: round(.5 * n_minority)}

features_res1, targets_res1 = RandomUnderSampler(ratio=ratio_dict1, random_state=2)\
    .fit_sample(features_train, targets_train.loan_status)

features_res2, targets_res2 = RandomUnderSampler(ratio=ratio_dict2, random_state=2)\
    .fit_sample(features_train, targets_train.loan_status)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### 1:1 minority to majority class undersampling

In [7]:
# baseline logistic regression trained on the 1:1 undersampled set
# accuracy considerably drops -- this same baseline model in pt1 had 82% accuracy on the test set

lr = LogisticRegression()
lr.fit(features_res1, targets_res1)

# accuracy_score expects (y_true, y_pred); the value is the same either way, but keeping the
# documented order matches the classification_report calls used in the rest of the notebook
print(accuracy_score(targets_test, lr.predict(features_test)))

0.641775963863


In [8]:
# minority-class precision drops from 50% to 28%, but this model in pt1 only made 4 negative
# class predictions. there remains room for improvement in precision, but the model has become
# much more sensitive to outputting the minority class
# recall is GREATLY improved -- this model in pt1 has close to 0% recall on the test set

lr_train_pred = lr.predict(features_res1)
lr_test_pred = lr.predict(features_test)

# one labelled report per split: resampled training set first, then the untouched test set
for split_label, split_targets, split_pred in (('TRAIN LR', targets_res1, lr_train_pred),
                                               ('TEST LR', targets_test, lr_test_pred)):
    print(split_label)
    print(classification_report(split_targets, split_pred))

TRAIN LR
             precision    recall  f1-score   support

          0       0.65      0.64      0.64     35780
          1       0.64      0.65      0.65     35780

avg / total       0.64      0.64      0.64     71560

TEST LR
             precision    recall  f1-score   support

          0       0.28      0.63      0.38      8929
          1       0.89      0.64      0.75     41545

avg / total       0.78      0.64      0.68     50474



In [9]:
# fit three more classifiers on the 1:1 undersampled training set
# increase the number of trees in RF from default 10 to 64
# random_state is pinned on the forest so its metrics are reproducible on re-run,
# using the same seed value as the undersampler

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

GNB = GaussianNB().fit(features_res1, targets_res1)
KNN = KNeighborsClassifier().fit(features_res1, targets_res1)
RF = RandomForestClassifier(n_estimators=64, random_state=2).fit(features_res1, targets_res1)

# metrics are for the test set only, for: Gaussian NB, kNN, and Random Forest classifiers

# unlike the case without undersampling, these three models offer little performance improvement in any of the metrics
# over logistic regression. undersampling seems to greatly improve logistic regression's performance, bringing it up to par
# with other algorithms (a bit better, even). still, the other algorithms have notable performance gains from undersampling

for model_label, model in (('GNB', GNB), ('kNN', KNN), ('RF', RF)):
    print(model_label)
    print(classification_report(targets_test, model.predict(features_test)))

GNB
             precision    recall  f1-score   support

          0       0.28      0.57      0.37      8929
          1       0.88      0.68      0.77     41545

avg / total       0.77      0.66      0.70     50474

kNN
             precision    recall  f1-score   support

          0       0.20      0.54      0.29      8929
          1       0.85      0.54      0.66     41545

avg / total       0.73      0.54      0.60     50474

RF
             precision    recall  f1-score   support

          0       0.26      0.65      0.37      8929
          1       0.89      0.61      0.72     41545

avg / total       0.78      0.62      0.66     50474



### 2:1 minority to majority class undersampling

In [10]:
# baseline logistic regression trained on the 2:1 (minority:majority) undersampled set
# accuracy is concerningly low..perhaps we are sacrificing too much in the recall of positive examples

lr2 = LogisticRegression()
lr2.fit(features_res2, targets_res2)

# accuracy_score expects (y_true, y_pred); the value is the same either way, but keeping the
# documented order matches the classification_report calls used in the rest of the notebook
print(accuracy_score(targets_test, lr2.predict(features_test)))

0.278341324246


In [11]:
# recall on the negative class is extremely good, but we have sacrificed too much predictability on the majority class
# (the negative class precision is quite low on the test set..the model now rarely makes positive predictions)

# there seems to be a balance that we can achieve between metric performance on positive/negative examples
# by changing the ratio of minority to majority class examples, but ideally we want to perform well on both

# rather than simply adjusting the ratio of class examples, we move onto other resampling techniques and applying
# different algorithms w/ hyperparameter optimization to aim for better performance on both metrics on both classes

# predict once per split and reuse the result for both the report and the prediction counts
lr2_train_pred = lr2.predict(features_res2)
lr2_test_pred = lr2.predict(features_test)

# print each label on its own line so the report's column header stays aligned
print('TRAIN')
print(classification_report(targets_res2, lr2_train_pred))
print('TEST')
print(classification_report(targets_test, lr2_test_pred))

# class 0 is the negative (minority) class, class 1 the positive (majority) class
print('test set negative class predictions: '+str((lr2_test_pred == 0).sum()))
print('test set positive class predictions: '+str((lr2_test_pred == 1).sum()))

TRAIN             precision    recall  f1-score   support

          0       0.69      0.96      0.80     35780
          1       0.62      0.13      0.22     17890

avg / total       0.67      0.68      0.61     53670

TEST             precision    recall  f1-score   support

          0       0.19      0.96      0.32      8929
          1       0.94      0.13      0.23     41545

avg / total       0.81      0.28      0.25     50474

test set negative class predictions: 44622
test set positive class predictions: 5852


In [12]:
# same three classifiers as before, now fit on the 2:1 (minority:majority) undersampled set
# note - the model label is printed on its own line (not concatenated to the report) for cleaner output
# random_state is pinned on the forest so its metrics are reproducible on re-run,
# using the same seed value as the undersampler

GNB2 = GaussianNB().fit(features_res2, targets_res2)
KNN2 = KNeighborsClassifier().fit(features_res2, targets_res2)
RF2 = RandomForestClassifier(n_estimators=64, random_state=2).fit(features_res2, targets_res2)

# metrics are for the test set only
for model_label, model in (('GNB', GNB2), ('kNN', KNN2), ('RF', RF2)):
    print(model_label)
    print(classification_report(targets_test, model.predict(features_test)))

GNB
             precision    recall  f1-score   support

          0       0.24      0.75      0.37      8929
          1       0.90      0.50      0.64     41545

avg / total       0.79      0.54      0.59     50474

kNN
             precision    recall  f1-score   support

          0       0.19      0.79      0.30      8929
          1       0.86      0.27      0.41     41545

avg / total       0.74      0.36      0.39     50474

RF
             precision    recall  f1-score   support

          0       0.22      0.88      0.35      8929
          1       0.92      0.31      0.46     41545

avg / total       0.80      0.41      0.44     50474



In [13]:
# regardless of undersampling ratio, test-set precision on the minority (0) class is fairly poor for every model (0.19-0.28)

#### we do not explore simple oversampling of the minority class, since duplicating minority examples has a known tendency to cause models to overfit

### Training Set Manipulation Approach 2: SMOTE (Synthetic Minority Oversampling Technique)

In [14]:
from imblearn.over_sampling import SMOTE