In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline
sns.set_style('whitegrid')

In [2]:
def select(query, db_path='./data/lending-club-loan-data/database2.sqlite'):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite file to open. Defaults to the lending-club
        database this notebook reads from.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the result-set
        columns. A query that matches no rows yields an empty frame that
        still carries the column names.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query).fetchall()
        # cursor.description is None when the statement produces no result set
        column_names = [col[0] for col in (cursor.description or [])]
    finally:
        # always release the connection, even when the query raises
        conn.close()

    # passing the names at construction keeps them for zero-row results
    return pd.DataFrame(rows, columns=column_names)

In [3]:
# read the whole LOAN_FINAL table and discard its 'index' column
loans = select('SELECT * FROM LOAN_FINAL').drop(columns='index')

In [4]:
# restrict the data to the first general problem at hand: "Matured Loans",
# i.e. rows whose loan_status is either 'Charged Off' or 'Fully Paid'

matured_statuses = ['Charged Off', 'Fully Paid']
loans = loans[loans['loan_status'].isin(matured_statuses)].copy()

# the status column is the prediction target; every other column is a candidate feature
targets = loans['loan_status'].copy()
features = loans.drop(columns='loan_status').copy()

In [5]:
# encode the targets as 0/1, then hold out 20% of the rows as a test set.
# the test set is to stay untouched until the final application of models

outputmap = {'Charged Off':0,'Fully Paid':1}
targets = targets.apply(lambda status: outputmap[status])

from sklearn.model_selection import train_test_split

# fixed random_state so the same rows land in each split on every run
features_train, features_test, targets_train, targets_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=12,
)

In [15]:
# develop lists which split up features into numerical and non-numerical values
# should do this in the processing step...

# walk the (column name, dtype) pairs directly; indexing `loans.dtypes` by
# integer position is deprecated on a label-indexed Series
numerical_all = []
strings = []

for column_name, column_dtype in loans.dtypes.items():
    if column_dtype == 'int64' or column_dtype == 'float64':
        numerical_all.append(column_name)
    elif column_dtype == 'O' and column_name != 'loan_status':
        strings.append(column_name)

# columns dropped from both `numerical_all` and `numerical`
excluded_from_all = [
    'mths_since_last_delinq',
    'mths_since_last_major_derog',
    'member_id',
]

# columns additionally dropped from `numerical`
# NOTE(review): presumably fields not suitable as baseline features -- confirm the rationale
excluded_from_numerical = excluded_from_all + [
    'delinq_2yrs',
    'inq_last_6mths',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_amnt',
    'collections_12_mths_ex_med',
    'tot_coll_amt',
    'mths_since_issue',
    'mths_left',
    '%_term_completed',
]

# keep failing loudly when an excluded name is not an int64/float64 column
# (a typo or schema change), as the previous list.remove() calls did
unknown_columns = [name for name in excluded_from_numerical if name not in numerical_all]
if unknown_columns:
    raise ValueError('columns to exclude are not numerical columns of loans: ' + str(unknown_columns))

# filter in column order so both lists keep the ordering of `loans`
numerical = [name for name in numerical_all if name not in excluded_from_numerical]
numerical_all = [name for name in numerical_all if name not in excluded_from_all]

In [18]:
# groups dataset into categorically similar subsets and gathers their means for value filling
# this function is specific to this application..may be worth further generalizing

def NaN_estimator(dataframe,column):
    """Fill missing values of `column` with the mean of categorically similar rows.

    Rows are grouped by ('grade', 'home_ownership', 'initial_list_status',
    'purpose'); each missing value is replaced by the mean of the non-missing
    values of `column` within the row's group.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the four grouping columns and `column`. It is not modified.
    column : str
        Name of the numeric column whose missing values are estimated.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` with `column` filled where a group mean exists.
        Values stay NaN when the row's group has no observed value. Rows with
        a missing grouping key are expected to stay NaN too, since groupby
        skips such keys by default; the caller decides how to handle them.
    """
    group_keys = ['grade','home_ownership','initial_list_status','purpose']

    # one vectorized pass: transform broadcasts each group's mean back onto
    # the rows of that group, aligned with the original index
    group_means = dataframe.groupby(by=group_keys)[column].transform('mean')

    filled_dataframe = dataframe.copy()
    # only the missing entries are replaced; observed values are untouched
    filled_dataframe[column] = dataframe[column].fillna(group_means)

    return filled_dataframe

In [19]:
# fill missing values one column at a time: first with the group-level mean
# estimate from NaN_estimator, then with the column's overall mean for
# anything that is still missing afterwards.
# NOTE(review): each split is imputed from its own statistics (test rows use
# test-set means) -- confirm this is intended rather than reusing train means
for impute_col in ('revol_util', 'tot_coll_amt'):
    features_train = NaN_estimator(features_train, impute_col)
    features_train[impute_col] = features_train[impute_col].fillna(features_train[impute_col].mean())

    features_test = NaN_estimator(features_test, impute_col)
    features_test[impute_col] = features_test[impute_col].fillna(features_test[impute_col].mean())

In [33]:
# training data has approx 1:5 ratio of minority to majority class examples

# label 0 = 'Charged Off' (minority), label 1 = 'Fully Paid' (majority)
n_charged_off = (targets_train == 0).sum()
n_fully_paid = (targets_train == 1).sum()

ratio = n_charged_off / n_fully_paid
ratio

0.2153942473241268

In [44]:
# ratio_dict1: undersample the majority class to create a 1:1 ratio of minority to majority class examples
# ratio_dict2: undersample the majority class to create a 2:1 ratio of minority to majority class examples

from imblearn.under_sampling import RandomUnderSampler

# number of minority-class (label 0) training examples; both strategies keep
# all of them and only reduce the number of label-1 examples
n_minority = int((targets_train == 0).sum())

ratio_dict1 = {0: n_minority, 1: n_minority}
ratio_dict2 = {0: n_minority, 1: round(.5 * n_minority)}

# `sampling_strategy` / `fit_resample` are the current names of the former
# `ratio` / `fit_sample`, which recent imbalanced-learn releases no longer accept
features_res1, targets_res1 = RandomUnderSampler(sampling_strategy=ratio_dict1, random_state=2)\
    .fit_resample(features_train[numerical], targets_train)

features_res2, targets_res2 = RandomUnderSampler(sampling_strategy=ratio_dict2, random_state=2)\
    .fit_resample(features_train[numerical], targets_train)

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## 1:1 minority to majority class undersampling

In [55]:
# accuracy considerably drops -- this same baseline model in pt1 had 82% accuracy on the test set

# baseline logistic regression trained on the 1:1 undersampled training data
lr = LogisticRegression()
lr.fit(features_res1,targets_res1)

# accuracy_score takes (y_true, y_pred); the score itself is symmetric, but the
# order now matches the classification_report calls used for this model
print(accuracy_score(targets_test, lr.predict(features_test[numerical])))

0.641775963863


In [57]:
# precision drops from 50% to 28%, but this model in pt1 only made 4 negative class predictions. there remains room for
# improvement in precision, but model has become much more sensitive to outputting the minority class
# recall is GREATLY improved -- this model in pt1 has close to 0% recall on the test set

# predict once per data set and reuse the result for the report and the counts
lr_train_pred = lr.predict(features_res1)
lr_test_pred = lr.predict(features_test[numerical])

print('TRAIN'+classification_report(targets_res1,lr_train_pred))
print('TEST'+classification_report(targets_test,lr_test_pred))

# predictions are 0/1 labels: count the zeros, and sum the ones
print('test set negative class predictions: '+str((lr_test_pred == 0).sum()))
print('test set positive class predictions: '+str(lr_test_pred.sum()))

TRAIN             precision    recall  f1-score   support

          0       0.65      0.64      0.64     35780
          1       0.64      0.65      0.65     35780

avg / total       0.64      0.64      0.64     71560

TEST             precision    recall  f1-score   support

          0       0.28      0.63      0.38      8929
          1       0.89      0.64      0.75     41545

avg / total       0.78      0.64      0.68     50474

test set negative class predictions: 20448
test set positive class predictions: 30026


## 2:1 minority to majority class undersampling

In [58]:
# accuracy is concerningly low..perhaps we are sacrificing too much in the recall of positive examples

# baseline logistic regression trained on the 2:1 (minority:majority) undersampled training data
lr2 = LogisticRegression()
lr2.fit(features_res2,targets_res2)

# accuracy_score takes (y_true, y_pred); the score itself is symmetric, but the
# order now matches the classification_report calls used for this model
print(accuracy_score(targets_test, lr2.predict(features_test[numerical])))

0.278341324246


In [61]:
# recall on the negative class is extremely good, but we have sacrificed too much predictability on the majority class
# (the negative class precision is quite low on the test set..the model is now insensitive to positive predictions)

# there seems to be a balance that we can achieve between metric performance on positive/negative examples
# by changing the ratio of minority to majority class examples, but ideally we want to perform well on both

# rather than simply adjusting the ratio of class examples, we move onto other resampling techniques and applying
# different algorithms w/ hyperparameter optimization to aim for better performance on both metrics on both classes

# predict once per data set and reuse the result for the report and the counts
lr2_train_pred = lr2.predict(features_res2)
lr2_test_pred = lr2.predict(features_test[numerical])

print('TRAIN'+classification_report(targets_res2,lr2_train_pred))
print('TEST'+classification_report(targets_test,lr2_test_pred))

# predictions are 0/1 labels: count the zeros, and sum the ones
print('test set negative class predictions: '+str((lr2_test_pred == 0).sum()))
print('test set positive class predictions: '+str(lr2_test_pred.sum()))

TRAIN             precision    recall  f1-score   support

          0       0.69      0.96      0.80     35780
          1       0.62      0.13      0.22     17890

avg / total       0.67      0.68      0.61     53670

TEST             precision    recall  f1-score   support

          0       0.19      0.96      0.32      8929
          1       0.94      0.13      0.23     41545

avg / total       0.81      0.28      0.25     50474

test set negative class predictions: 44622
test set positive class predictions: 5852
