# Optimizing Our Hyperparameters

Since the Random Forest and Gradient Boosting performed nearly the same, we are going to try to optimize Gradient Boosting as it is infinitely faster than Random Forest. 

We will attempt to optimize for both ROC AUC and F1 scoring and see which one improves our model the best. 

In [2]:
pip install scikit-optimize

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/5c/87/310b52debfbc0cb79764e5770fa3f5c18f6f0754809ea9e2fc185e1b67d3/scikit_optimize-0.7.4-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 18.8MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.7MB/s eta 0:00:01[K     |████████████▎                   | 30kB 2.3MB/s eta 0:00:01[K     |████████████████▎               | 40kB 1.7MB/s eta 0:00:01[K     |████████████████████▍           | 51kB 1.8MB/s eta 0:00:01[K     |████████████████████████▌       | 61kB 2.2MB/s eta 0:00:01[K     |████████████████████████████▌   | 71kB 2.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.0MB/s 
Collecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/35/1e/eda9fe07f752ced7afcef590e7d74390f0d9c9c0b7ff98317afbaa0697e3/pyaml-19.12.0-py2.py3-none-any.whl
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml

In [0]:
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import lightgbm as lgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import matplotlib.pyplot as plt
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
import numpy as np

In [4]:
from google.colab import drive

drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/datasets/MLready-lc.csv', index_col='id')
df.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,loan_amnt,term,int_rate,installment,emp_length,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,acc_now_delinq,tot_coll_amt,tot_cur_bal,mths_since_rcnt_il,total_rev_hi_lim,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,default,index,x0_A,x0_B,x0_C,x0_D,x0_E,x0_F,x0_G,x1_ANY,x1_MORTGAGE,x1_NONE,x1_OWN,x1_RENT,x2_Not Verified,x2_Source Verified,x2_Verified,x3_ANY,x3_MORTGAGE,x3_NONE,x3_OWN,x3_RENT,x4_car,x4_credit_card,x4_debt_consolidation,x4_educational,x4_home_improvement,x4_house,x4_major_purchase,x4_medical,x4_moving,x4_other,x4_renewable_energy,x4_small_business,x4_vacation,x4_wedding
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
68407277,3600.0,36,13.99,123.03,10,55000.0,5.91,0.0,675.0,679.0,1.0,30.0,-999.0,7.0,0.0,2765.0,29.7,13.0,564.0,560.0,0.0,30.0,0.0,722.0,144904.0,21.0,9300.0,4.0,20701.0,1506.0,37.2,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68355089,24700.0,36,11.99,820.28,10,65000.0,16.06,1.0,715.0,719.0,4.0,6.0,-999.0,22.0,0.0,21470.0,19.2,38.0,699.0,695.0,0.0,-999.0,0.0,0.0,204396.0,19.0,111800.0,4.0,9733.0,57830.0,27.1,0.0,113.0,192.0,2.0,2.0,4.0,2.0,-999.0,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
66310712,35000.0,60,14.85,829.9,10,110000.0,17.06,0.0,785.0,789.0,0.0,-999.0,-999.0,13.0,0.0,7802.0,11.6,17.0,679.0,675.0,0.0,-999.0,0.0,0.0,301500.0,23.0,67300.0,2.0,23192.0,54962.0,12.1,0.0,36.0,87.0,2.0,2.0,1.0,2.0,-999.0,-999.0,-999.0,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,0,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68476807,10400.0,60,22.45,289.91,3,104433.0,25.37,1.0,695.0,699.0,3.0,12.0,-999.0,12.0,0.0,21929.0,64.5,35.0,704.0,700.0,0.0,-999.0,0.0,0.0,331730.0,14.0,34000.0,10.0,27644.0,4567.0,77.5,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,0,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68426831,11950.0,36,13.44,405.18,4,34000.0,10.2,0.0,690.0,694.0,0.0,-999.0,-999.0,5.0,0.0,8822.0,68.4,6.0,759.0,755.0,0.0,-999.0,0.0,0.0,12798.0,338.0,12900.0,0.0,2560.0,844.0,91.0,0.0,338.0,54.0,32.0,32.0,0.0,36.0,-999.0,-999.0,-999.0,0.0,2.0,3.0,2.0,2.0,2.0,4.0,4.0,3.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0,0,4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
X = df.drop(columns='default')
y = df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=7)

# ROC AUC Optimization

In [0]:
baseclf = lgb.LGBMClassifier()

In [10]:
baseroc_cv_scores = cross_val_score(baseclf, X_train, y_train, cv=5, scoring='roc_auc')
print(f'Baseline CV ROC AUC scores: {baseroc_cv_scores}')
print(f'Baseline mean ROC AUC scores: {baseroc_cv_scores.mean()}')

Baseline CV ROC AUC scores: [0.9597282  0.95920238 0.95919869 0.95952834 0.95913279]
Baseline mean ROC AUC scores: 0.9593580795121411


In [0]:
space = [Real(0.01, 0.5, name='learning_rate', prior='log-uniform'),
         Integer(-1, 30, name='max_depth'),
         Integer(2, 100, name='num_leaves'),
         Integer(10, 1000, name='min_data_in_leaf'),
         Real(0.1, 1.0, name='feature_fraction', prior='uniform'),
         Real(0.1, 1.0, name='subsample', prior='uniform')]

In [0]:
clf = lgb.LGBMClassifier()

In [0]:
@use_named_args(space)
def rocaucobjective(**params):
    clf.set_params(**params)
    return -np.mean(cross_val_score(clf, X_train, y_train, cv=5, n_jobs=-1, scoring="roc_auc"))


In [0]:
roc_gp = gp_minimize(rocaucobjective, space, n_calls=200)



In [0]:
print("""Best parameters:
- learning_rate=%.6f
- max_depth=%d
- num_leaves=%d
- min_data_in_leaf=%d
- feature_fraction=%.6f
- subsample=%.6f""" % (roc_gp.x[0], roc_gp.x[1], 
                            roc_gp.x[2], roc_gp.x[3], 
                            roc_gp.x[4],roc_gp.x[5]))

Best parameters:
- learning_rate=0.096394
- max_depth=30
- num_leaves=100
- min_data_in_leaf=1000
- feature_fraction=1.000000
- subsample=0.100000


In [0]:
rocclf = lgb.LGBMClassifier(learning_rate=0.096394, max_depth=30, num_leaves=100, min_data_in_leaf=1000, 
                            feature_fraction=1, subsample=.1)

In [11]:
roc_cv_scores = cross_val_score(rocclf, X_train, y_train, cv=5, scoring='roc_auc')
print(f'ROC CV scores: {roc_cv_scores}')
print(f'ROC CV mean scores: {roc_cv_scores.mean()}')
print(f'Our mean ROC AUC score improved by {roc_cv_scores.mean() - baseroc_cv_scores.mean()}')

ROC CV scores: [0.96012972 0.95968179 0.95972387 0.95994962 0.95941812]
ROC CV mean scores: 0.9597806259628487
Our mean ROC AUC score improved by 0.000422546450707606


# F1 Scoring Optimization 

In [0]:
baseclf = lgb.LGBMClassifier()

In [13]:
base_f1cv_scores = cross_val_score(baseclf, X_train, y_train, cv=5, scoring='f1')
print(f'Baseline CV F1 scores: {base_f1cv_scores}')
print(f'Baseline mean F1 scores: {base_f1cv_scores.mean()}')

Baseline CV F1 scores: [0.74892568 0.74809997 0.74877263 0.75173346 0.74817737]
Baseline mean F1 scores: 0.7491418214630101


In [0]:
f1clf = lgb.LGBMClassifier()

In [0]:
@use_named_args(space)
def f1objective(**params):
    clf.set_params(**params)
    return -np.mean(cross_val_score(f1clf, X_train_up, y_train_up, cv=5, n_jobs=-1,
                                    scoring="f1"))

In [0]:
f1_gp = gp_minimize(f1objective, space, n_calls=200)

In [0]:
print("""Best parameters:
- learning_rate=%.6f 
- max_depth=%d
- num_leaves=%d
- min_data_in_leaf=%d
- feature_fraction=%.6f
- subsample=%.6f""" % (f1_gp.x[0], f1_gp.x[1], f1_gp.x[2], f1_gp.x[3], f1_gp.x[4],f1_gp.x[5]))


In [0]:
f1clf = lgb.LGBMClassifier(learning_rate=0.373731, max_depth=23, num_leaves=81, min_data_in_leaf=570, 
                            feature_fraction=0.962619, subsample=0.1333539)


In [15]:
f1clf_cv_scores = cross_val_score(f1clf, X_train, y_train, cv=5, scoring='f1')
print(f'F1 Classifier CV F1 scores: {f1clf_cv_scores}')
print(f'F1 Classifier mean F1 scores: {f1clf_cv_scores.mean()}')
print(f'Our mean F1 score improved by {f1clf_cv_scores.mean() - base_f1cv_scores.mean()}')

F1 Classifier CV F1 scores: [0.74263706 0.73967295 0.74062307 0.74484    0.74384724]
F1 Classifier mean F1 scores: 0.7423240620005428
Our mean F1 score improved by -0.006817759462467321


In [0]:
f1clf.fit(X_train, y_train)

# Putting Classifiers To The Test

In [20]:
baseclf = lgb.LGBMClassifier()
baseclf.fit(X_train, y_train)

print ("\n\n ---Gradient Boosting Model---")
base_roc_auc = roc_auc_score(y_test, baseclf.predict(X_test))
print ("Gradient Boosting AUC = %2.2f" % base_roc_auc)
print(classification_report(y_test, baseclf.predict(X_test), digits=4))



 ---Gradient Boosting Model---
Gradient Boosting AUC = 0.85
              precision    recall  f1-score   support

           0     0.9587    0.9566    0.9576    211463
           1     0.7427    0.7524    0.7475     35214

    accuracy                         0.9275    246677
   macro avg     0.8507    0.8545    0.8526    246677
weighted avg     0.9279    0.9275    0.9276    246677



In [18]:
f1clf = lgb.LGBMClassifier(learning_rate=0.373731, max_depth=23, num_leaves=81, min_data_in_leaf=570, 
                            feature_fraction=0.962619, subsample=0.1333539)
f1clf.fit(X_train, y_train)

print ("\n\n ---Gradient Boosting Model---")
f1_roc_auc = roc_auc_score(y_test, f1clf.predict(X_test))
print ("Gradient Boosting AUC = %2.2f" % f1_roc_auc)
print(classification_report(y_test, f1clf.predict(X_test), digits=4))



 ---Gradient Boosting Model---
Gradient Boosting AUC = 0.85
              precision    recall  f1-score   support

           0     0.9577    0.9563    0.9570    211463
           1     0.7396    0.7462    0.7429     35214

    accuracy                         0.9263    246677
   macro avg     0.8487    0.8512    0.8499    246677
weighted avg     0.9266    0.9263    0.9264    246677



In [19]:
rocclf = lgb.LGBMClassifier(learning_rate=0.096394, max_depth=30, num_leaves=100, min_data_in_leaf=1000, 
                            feature_fraction=1, subsample=.1)
rocclf.fit(X_train, y_train)

print ("\n\n ---Gradient Boosting Model---")
roc_roc_auc = roc_auc_score(y_test, rocclf.predict(X_test))
print ("Gradient Boosting AUC = %2.2f" % f1_roc_auc)
print(classification_report(y_test, rocclf.predict(X_test), digits=4))



 ---Gradient Boosting Model---
Gradient Boosting AUC = 0.85
              precision    recall  f1-score   support

           0     0.9587    0.9568    0.9577    211463
           1     0.7435    0.7526    0.7480     35214

    accuracy                         0.9276    246677
   macro avg     0.8511    0.8547    0.8529    246677
weighted avg     0.9280    0.9276    0.9278    246677

