## Regularization and Hyperparameter Optimization

In [209]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

%matplotlib inline
sns.set_style('whitegrid')

In [210]:
def select(query, db_path='./data/lending-club-loan-data/database2.sqlite'):
    """Run a SQL query against a SQLite database and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    db_path : str, optional
        Path of the SQLite database file. Defaults to the database used
        throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the result columns
        reported by the cursor. A query that matches no rows returns an empty
        frame that still carries those column names.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        rows = cursor.execute(query).fetchall()
        # cursor.description is None for statements that produce no result set
        column_names = [column[0] for column in (cursor.description or [])]
    finally:
        # release the connection even when the query raises
        conn.close()

    # naming the columns at construction keeps zero-row results valid
    return pd.DataFrame(rows, columns=column_names)

In [211]:
# Pull the pre-split feature and target tables out of the SQLite database.
features_train, targets_train = (
    select('SELECT * FROM FEATURES_TRAIN'),
    select('SELECT * FROM TARGETS_TRAIN'),
)
features_test, targets_test = (
    select('SELECT * FROM FEATURES_TEST'),
    select('SELECT * FROM TARGETS_TEST'),
)

In [212]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer

In [213]:
# our scoring functions will all focus on the negative class -- we already know the model performs well on the positive class
# we are looking to optimize model performance on the negative examples. we will eventually have to consider model
# performance on the positive class as well -- can't sacrifice too much of the predictability on positives

# F1 score on the negative class may intuitively seem to be the best metric to optimize for, but in the context of loan
# portfolio optimization, it may be more important to detect bad loans -- i.e. maximize recall on the negative class,
# even if it means reduced precision and F1 (classifying many good loans as bad)

def neg_f1(targets_true,targets_predicted):
    """F1 score of the negative class.

    The negative class is whichever label occupies the first row/column of the
    2x2 confusion matrix (presumably label 0 here -- confirm against the
    encoding of `loan_status`).

    Parameters
    ----------
    targets_true : array-like
        Ground-truth labels.
    targets_predicted : array-like
        Predicted labels.

    Returns
    -------
    float
        Harmonic mean of negative-class precision and recall, or 0.0 when no
        negative sample was classified correctly.
    """
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted).ravel()
    # With no true negatives, precision and/or recall is 0 or undefined (0/0).
    # Report 0.0 rather than NaN so grid-search rankings stay well defined.
    if tn == 0:
        return 0.0
    precision = tn/(tn+fn)
    recall = tn/(tn+fp)
    return 2*(precision*recall)/(precision+recall)

def neg_precision(targets_true,targets_predicted):
    """Precision of the negative class: tn / (tn + fn).

    The negative class is whichever label occupies the first row/column of the
    2x2 confusion matrix (presumably label 0 here -- confirm against the
    encoding of `loan_status`).

    Returns
    -------
    float
        Share of negative predictions that are truly negative, or 0.0 when the
        model predicts no negatives at all.
    """
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted).ravel()
    predicted_negative = tn+fn
    # No negative predictions would make this 0/0; return 0.0 instead of NaN
    # so grid-search rankings stay well defined.
    if predicted_negative == 0:
        return 0.0
    return tn/predicted_negative

def neg_recall(targets_true,targets_predicted):
    """Recall of the negative class: tn / (tn + fp).

    The negative class is whichever label occupies the first row/column of the
    2x2 confusion matrix (presumably label 0 here -- confirm against the
    encoding of `loan_status`).

    Returns
    -------
    float
        Share of truly negative samples that are predicted negative, or 0.0
        when there are no negative samples to recall.
    """
    tn, fp, fn, tp = confusion_matrix(targets_true,targets_predicted).ravel()
    actual_negative = tn+fp
    # No negative samples would make this 0/0; return 0.0 instead of NaN.
    if actual_negative == 0:
        return 0.0
    return tn/actual_negative

---
### Logistic Regression

In [214]:
# Baseline: logistic regression with default settings, scored by
# negative-class F1 on the test split.
lr = LogisticRegression()
lr.fit(features_train, targets_train['loan_status'])
neg_f1(targets_test['loan_status'], lr.predict(features_test))

0.00044777790216052836

In [215]:
# Grid-search the regularization parameter C three times, once per
# negative-class scorer: F1, precision, recall.

params = {'C': [.001, .01, .1, 1, 10, 100]}

lr = LogisticRegression()

clfF1, clfPREC, clfREC = (
    GridSearchCV(lr, param_grid=params, scoring=make_scorer(metric), return_train_score=True)
    .fit(features_train, targets_train.loan_status)
    for metric in (neg_f1, neg_precision, neg_recall)
)

In [216]:
# The three result tables that follow show that regularization does little to
# help logistic regression here; the class imbalance inherent in the data
# greatly limits performance.

(
    pd.DataFrame(clfF1.cv_results_)
    .loc[:, ['mean_test_score', 'param_C']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_C
3,0.013888,1.0
0,0.012683,0.001
2,0.006705,0.1
4,0.002172,10.0
5,0.001117,100.0
1,0.000838,0.01


In [217]:
# Oddly, the two best-scoring C values sit at opposite ends of the grid.

(
    pd.DataFrame(clfPREC.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .loc[:, ['mean_test_score', 'param_C']]
)

Unnamed: 0,mean_test_score,param_C
0,0.548325,0.001
5,0.539683,100.0
1,0.498932,0.01
2,0.480212,0.1
4,0.451,10.0
3,0.437638,1.0


In [218]:
# Cross-validated negative-class recall for each C, best first.
(
    pd.DataFrame(clfREC.cv_results_)
    .loc[:, ['mean_test_score', 'param_C']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_C
3,0.007239,1.0
0,0.006596,0.001
2,0.00341,0.1
4,0.00109,10.0
5,0.000559,100.0
1,0.000419,0.01


In [219]:
# On the test split the tuned models score about as poorly on F1, precision
# and recall as the unregularized baseline did.

print(
    neg_f1(targets_test.loan_status, clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test.loan_status, clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test.loan_status, clfREC.best_estimator_.predict(features_test)),
    sep='\n',
)

0.000447777902161
0.466666666667
0.000223989248516


In [220]:
# Keep the logistic-regression test scores for the summary section.
LRF1, LRPREC, LRREC = (
    neg_f1(targets_test.loan_status, clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test.loan_status, clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test.loan_status, clfREC.best_estimator_.predict(features_test)),
)

---
### Gaussian Naive-Bayes

In [221]:
from sklearn.naive_bayes import GaussianNB

In [222]:
# By default GNB estimates its class priors from the data, i.e. as the class
# proportions. Overriding the priors makes the model more sensitive to a chosen
# class; intuitively this should act much like resampling the training data
# while leaving the default priors alone.

# Each pair lists the prior of the negative class first.

params = {'priors': [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5], [0.6, 0.4]]}

GNB = GaussianNB()

# One search per negative-class scorer; fit() returns the search object itself,
# so building and fitting in two steps leaves the same objects bound.
clfF1 = GridSearchCV(GNB, param_grid=params, scoring=make_scorer(neg_f1), return_train_score=True)
clfF1.fit(features_train, targets_train.loan_status)

clfPREC = GridSearchCV(GNB, param_grid=params, scoring=make_scorer(neg_precision), return_train_score=True)
clfPREC.fit(features_train, targets_train.loan_status)

clfREC = GridSearchCV(GNB, param_grid=params, scoring=make_scorer(neg_recall), return_train_score=True)
clfREC.fit(features_train, targets_train.loan_status)

In [223]:
# For a balance between precision and recall, roughly even priors look optimal.

(
    pd.DataFrame(clfF1.cv_results_)
    .loc[:, ['mean_test_score', 'param_priors']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_priors
4,0.373224,"[0.5, 0.5]"
5,0.367746,"[0.6, 0.4]"
3,0.367621,"[0.4, 0.6]"
2,0.351018,"[0.3, 0.7]"
1,0.327848,"[0.2, 0.8]"
0,0.268723,"[0.1, 0.9]"


In [224]:
# Precision and recall rank the prior settings in opposite orders: we can give
# up precision to gain recall and vice versa. This closely mirrors what
# happened with undersampling.

(
    pd.DataFrame(clfPREC.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .loc[:, ['mean_test_score', 'param_priors']]
)

Unnamed: 0,mean_test_score,param_priors
0,0.357363,"[0.1, 0.9]"
1,0.325372,"[0.2, 0.8]"
2,0.302458,"[0.3, 0.7]"
3,0.285865,"[0.4, 0.6]"
4,0.267287,"[0.5, 0.5]"
5,0.246101,"[0.6, 0.4]"


In [225]:
# Cross-validated negative-class recall for each prior setting, best first.
(
    pd.DataFrame(clfREC.cv_results_)
    .loc[:, ['mean_test_score', 'param_priors']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_priors
5,0.728144,"[0.6, 0.4]"
4,0.619088,"[0.5, 0.5]"
3,0.515315,"[0.4, 0.6]"
2,0.418306,"[0.3, 0.7]"
1,0.330436,"[0.2, 0.8]"
0,0.215344,"[0.1, 0.9]"


In [226]:
# F1 and recall are far better than with logistic regression, as was already
# the case for the baseline model without resampling.

print(
    neg_f1(targets_test['loan_status'], clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test['loan_status'], clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test['loan_status'], clfREC.best_estimator_.predict(features_test)),
    sep='\n',
)

0.367787323277
0.35449833395
0.722589315713


In [227]:
# Keep the Gaussian Naive-Bayes test scores for the summary section.
GNBF1, GNBPREC, GNBREC = (
    neg_f1(targets_test['loan_status'], clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test['loan_status'], clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test['loan_status'], clfREC.best_estimator_.predict(features_test)),
)

---
### k Nearest Neighbors

In [228]:
from sklearn.neighbors import KNeighborsClassifier

In [229]:
# Grid-search the number of neighbours once per negative-class scorer.
#
# NOTE: these fits take a while, but they must run. The cells below read
# clfF1 / clfPREC / clfREC and expect kNN results (e.g. a `param_n_neighbors`
# column). If the fits are skipped, a fresh Restart & Run All leaves those names
# bound to the searches fit earlier in the notebook: the kNN tables fail and the
# kNN* summary scores silently describe a different model.

params = {'n_neighbors':list(range(1,11))}

kNN = KNeighborsClassifier()

clfF1 = GridSearchCV(kNN,param_grid=params,scoring=make_scorer(neg_f1),return_train_score=True)\
    .fit(features_train,targets_train.loan_status)
clfPREC = GridSearchCV(kNN,param_grid=params,scoring=make_scorer(neg_precision),return_train_score=True)\
    .fit(features_train,targets_train.loan_status)
clfREC = GridSearchCV(kNN,param_grid=params,scoring=make_scorer(neg_recall),return_train_score=True)\
    .fit(features_train,targets_train.loan_status)

In [230]:
# Overall performance is much weaker than Gaussian Naive-Bayes.

(
    pd.DataFrame(clfF1.cv_results_)
    .loc[:, ['mean_test_score', 'param_n_neighbors']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_n_neighbors
1,0.255185,2
3,0.205052,4
0,0.203513,1
5,0.152544,6
2,0.150728,3
7,0.119035,8
4,0.107831,5
9,0.091316,10
6,0.081866,7
8,0.062036,9


In [231]:
# A larger number of neighbours tends to raise the model's precision.

(
    pd.DataFrame(clfPREC.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .loc[:, ['mean_test_score', 'param_n_neighbors']]
)

Unnamed: 0,mean_test_score,param_n_neighbors
9,0.259177,10
8,0.25733,9
6,0.248811,7
7,0.246623,8
4,0.23026,5
5,0.228096,6
2,0.221454,3
3,0.218699,4
0,0.203898,1
1,0.198596,2


In [232]:
# The precision gains from adding neighbours seem to taper off quickly on both
# the test set and the training set; here a much larger k (70) is scored by
# negative-class precision on the test split.

kNN1 = KNeighborsClassifier(n_neighbors=70)
kNN1.fit(features_train, targets_train.loan_status)
neg_precision(targets_test.loan_status, kNN1.predict(features_test))

0.33333333333333331

In [233]:
# Cross-validated negative-class recall for each neighbour count, best first.
(
    pd.DataFrame(clfREC.cv_results_)
    .loc[:, ['mean_test_score', 'param_n_neighbors']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_n_neighbors
1,0.356875,2
0,0.20313,1
3,0.193013,4
5,0.114589,6
2,0.114254,3
7,0.078452,8
4,0.070402,5
9,0.055422,10
6,0.048994,7
8,0.035271,9


In [234]:
# Test-split scores of the best kNN model under each negative-class scorer
# (F1, precision, recall -- one per line).
print(
    neg_f1(targets_test.loan_status, clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test.loan_status, clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test.loan_status, clfREC.best_estimator_.predict(features_test)),
    sep='\n',
)

0.25316759263
0.263157894737
0.354686975025


In [235]:
# Keep the kNN test scores for the summary section.
kNNF1, kNNPREC, kNNREC = (
    neg_f1(targets_test.loan_status, clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test.loan_status, clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test.loan_status, clfREC.best_estimator_.predict(features_test)),
)

---
### Random Forest

In [236]:
from sklearn.ensemble import RandomForestClassifier

In [237]:
# More estimators (trees) in a random forest generally improve performance at
# computational expense. The tree count is left out of the grid (it would take
# too long to compute) so the other parameters can be isolated.
# The values below were chosen somewhat haphazardly for this first run: the aim
# is only to see trends as they vary, and the grid is kept small because it is
# expensive to evaluate.
#
# NOTE: these fits are slow, but they must run. The cells below read
# clfF1 / clfPREC / clfREC and expect random-forest results (e.g. the
# `param_min_samples_leaf` column). If the fits are skipped, a fresh
# Restart & Run All leaves those names bound to searches fit earlier in the
# notebook, and the RF* summary scores silently describe a different model.

params = {'min_samples_split':[2,8,32],'min_samples_leaf':[1,16,32]}

# Seeded so repeated runs build the same forests; 10 is the seed the single
# RF1 fit further down already uses.
RF = RandomForestClassifier(random_state=10)

clfF1 = GridSearchCV(RF,param_grid=params,scoring=make_scorer(neg_f1),return_train_score=True)\
    .fit(features_train,targets_train.loan_status)
clfPREC = GridSearchCV(RF,param_grid=params,scoring=make_scorer(neg_precision),return_train_score=True)\
    .fit(features_train,targets_train.loan_status)
clfREC = GridSearchCV(RF,param_grid=params,scoring=make_scorer(neg_recall),return_train_score=True)\
    .fit(features_train,targets_train.loan_status)

In [238]:
# For both F1 and recall, raising the minimum sample counts does not help the
# model (it actually seems to hurt a lot); the defaults of 1 and 2 look best.

(
    pd.DataFrame(clfF1.cv_results_)
    .loc[:, ['mean_test_score', 'param_min_samples_leaf', 'param_min_samples_split']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_min_samples_leaf,param_min_samples_split
0,0.214364,1,2
1,0.151885,1,8
2,0.100325,1,32
4,0.073562,16,8
5,0.070909,16,32
3,0.070814,16,2
7,0.050522,32,8
6,0.0498,32,2
8,0.045941,32,32


In [239]:
# For precision, however, raising the minimum number of samples per leaf seems
# to improve the model considerably.

(
    pd.DataFrame(clfPREC.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .loc[:, ['mean_test_score', 'param_min_samples_leaf', 'param_min_samples_split']]
)

Unnamed: 0,mean_test_score,param_min_samples_leaf,param_min_samples_split
8,0.488115,32,32
6,0.478883,32,2
7,0.475752,32,8
3,0.468616,16,2
5,0.467752,16,32
4,0.464399,16,8
2,0.42908,1,32
1,0.374922,1,8
0,0.336879,1,2


In [240]:
# Precision jumps around quite a bit as the minimum number of samples per leaf
# varies, but an 'OK' level (around ~0.5) is achievable on the test set.

RF1 = RandomForestClassifier(min_samples_leaf=60, random_state=10)
RF1.fit(features_train, targets_train.loan_status)
print(neg_precision(targets_test.loan_status, RF1.predict(features_test)))
# Negated sum of (prediction - 1): this counts the predictions equal to 0,
# assuming the labels are encoded as 0/1 -- TODO confirm.
print(-1 * (RF1.predict(features_test) - 1).sum())

0.5
356


In [241]:
# Cross-validated negative-class recall for each min-samples setting, best first.
(
    pd.DataFrame(clfREC.cv_results_)
    .loc[:, ['mean_test_score', 'param_min_samples_leaf', 'param_min_samples_split']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,mean_test_score,param_min_samples_leaf,param_min_samples_split
0,0.158077,1,2
1,0.092286,1,8
2,0.056931,1,32
5,0.039352,16,32
3,0.038206,16,2
4,0.037479,16,8
6,0.026495,32,2
8,0.026355,32,32
7,0.025601,32,8


In [242]:
# Test-split scores of the best random forest under each negative-class scorer
# (F1, precision, recall -- one per line).
print(
    neg_f1(targets_test['loan_status'], clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test['loan_status'], clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test['loan_status'], clfREC.best_estimator_.predict(features_test)),
    sep='\n',
)

0.207618314736
0.479289940828
0.157464441707


In [243]:
# Keep the random-forest test scores for the summary section.
RFF1, RFPREC, RFREC = (
    neg_f1(targets_test['loan_status'], clfF1.best_estimator_.predict(features_test)),
    neg_precision(targets_test['loan_status'], clfPREC.best_estimator_.predict(features_test)),
    neg_recall(targets_test['loan_status'], clfREC.best_estimator_.predict(features_test)),
)

## Summary

In [244]:
# visualize summary