In [52]:
import sys

sys.path.append("../libs")


CONFIG = {
    "model_path": "../data/model/model_baseline.joblib",
    "features_path": "../data/model/baseline_features_list.json",
    "feature_dataset_database_file": "../data/feature_store.db",
}

## Load Model

In [53]:
import joblib

model = joblib.load(CONFIG["model_path"])

In [54]:
import json

with open(CONFIG["features_path"], 'r') as f:
    features = json.load(f)
features

['funded_amnt',
 'term',
 'installment',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'purpose',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'application_type',
 'annual_inc_joint',
 'dti_joint',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl'

In [55]:
from feature_dataset import FeatureDataset
import pandas as pd
fs = FeatureDataset(CONFIG['feature_dataset_database_file'])

df_calibration: pd.DataFrame = fs.get_feature_dataset(name="calibration", version=1)

In [56]:
df_calibration['time_of_banking_days'] = (df_calibration['issue_d'] - df_calibration['earliest_cr_line']).dt.days
df_calibration['time_of_banking_months'] = (df_calibration['issue_d'].dt.year - df_calibration['earliest_cr_line'].dt.year) * 12 + (df_calibration['issue_d'].dt.month - df_calibration['earliest_cr_line'].dt.month)

In [40]:
cat_features = [
    "emp_length",
    "home_ownership",
    "verification_status",
    "purpose",
    "addr_state",
    "application_type",
]
df_calibration[cat_features] = df_calibration[cat_features].astype(str).fillna('missing')


In [61]:
import numpy as np
df_training, df_testing = np.split(df_calibration.sort_values(by='issue_d'), [int(.80 *len(df_calibration))])

In [62]:
df_training

Unnamed: 0,id,loan_amnt,funded_amnt,term,installment,emp_length,home_ownership,annual_inc,verification_status,issue_d,...,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,default,time_of_banking_days,time_of_banking_months,target
0,65682103,24000,24000,36,732.500,< 1 year,RENT,95000.0,Source Verified,2015-11-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,6179,203,0.0
1073,65924925,26300,26300,60,661.500,10+ years,MORTGAGE,71500.0,Source Verified,2015-11-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,4322,142,0.0
1072,66074800,16000,16000,60,327.250,10+ years,RENT,120000.0,Source Verified,2015-11-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,9284,305,0.0
1071,65815490,22000,22000,36,688.500,2 years,MORTGAGE,140000.0,Not Verified,2015-11-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,5052,166,0.0
1070,65787086,5000,5000,36,155.000,5 years,MORTGAGE,109000.0,Source Verified,2015-11-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,5478,180,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
366769,91612355,9000,9000,36,296.750,5 years,MORTGAGE,45000.0,Source Verified,2016-10-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,1704,56,0.0
366768,91302718,25000,25000,36,840.000,< 1 year,MORTGAGE,100000.0,Source Verified,2016-10-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,5418,178,0.0
366767,91040250,3000,3000,36,111.500,3 years,RENT,78000.0,Verified,2016-10-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,3714,122,0.0
366766,91022599,8000,8000,36,260.000,4 years,RENT,48000.0,Not Verified,2016-10-01,...,-1.0,-1,-1,-1.0,-1.0,-1.0,0.0,8827,290,0.0


In [44]:
features = features + ["target"]

In [45]:
df_training = df_training[features]
df_testing = df_testing[features]

In [46]:
X_train = df_training.drop(columns=["target"])
y_train = df_training["target"]

X_test = df_testing.drop(columns=["target"])
y_test = df_testing["target"]

In [60]:
df_training["target"]

0         0.0
1073      0.0
1072      0.0
1071      0.0
1070      0.0
         ... 
366769    0.0
366768    0.0
366767    0.0
366766    0.0
366765    0.0
Name: target, Length: 354736, dtype: float32

In [47]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss
import joblib
import json

# Platt scaling (sigmoid)
print('Fitting platt scaling calibration...')
calibrated_model_sigmoid = CalibratedClassifierCV(model, method='sigmoid')
calibrated_model_sigmoid.fit(X_train, y_train)


Fitting platt scaling calibration...


ValueError: y should be a 1d array, got an array of shape (354736, 2) instead.

In [None]:
y_pred_sigmoid = calibrated_model_sigmoid.predict_proba(X_test)[:, 1]