In [45]:
import sys

sys.path.append("../libs")


CONFIG = {
    "model_path": "../data/model/model_calibrated.joblib",
    "features_path": "../data/model/baseline_features_list.json",
    "baseline_params": {
        "iterations": 1000,
        "depth": 12,
        "l2_leaf_reg": 6.746177457576027,
        "bagging_temperature": 0.8742701385986703,
        "random_strength": 1.396267713352505e-06,
        "colsample_bylevel": 0.9806866307913759,
        "subsample": 0.8259950282990149,
        "auto_class_weights": "SqrtBalanced",
        "learning_rate": 0.03666539436503066,
    },
        "features_to_remove": [
            'loan_status', 'int_rate', 'debt_settlement_flag',
               "last_pymnt_d",
    "last_pymnt_amnt",
    "recoveries",
    "collection_recovery_fee",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
    "total_rec_int",
    "total_rec_late_fee",
    "funded_amnt_inv",
    #"int_rate",
    "acc_now_delinq",
    "grade", "sub_grade",
    "delinq_amnt"]

}

In [46]:
import pandas as pd

df = pd.read_parquet('../data/lending_club_case_train_dataset.parquet')
df_test = pd.read_parquet('../data/lending_club_case_case_test_dataset.parquet')

In [47]:
import json
import joblib

model = joblib.load(CONFIG["model_path"])

with open(CONFIG["features_path"], 'r') as f:
    features = json.load(f)

features

['funded_amnt',
 'term',
 'installment',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'purpose',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'earliest_cr_line',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'application_type',
 'annual_inc_joint',
 'dti_joint',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl'

In [48]:
df['time_of_banking_days'] = (df['issue_d'] - df['earliest_cr_line']).dt.days
df['time_of_banking_months'] = (df['issue_d'].dt.year - df['earliest_cr_line'].dt.year) * 12 + (df['issue_d'].dt.month - df['earliest_cr_line'].dt.month)

df_test['time_of_banking_days'] = (df_test['issue_d'] - df_test['earliest_cr_line']).dt.days
df_test['time_of_banking_months'] = (df_test['issue_d'].dt.year - df_test['earliest_cr_line'].dt.year) * 12 + (df_test['issue_d'].dt.month - df_test['earliest_cr_line'].dt.month)

df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]


In [49]:
cat_features = [
    "emp_length",
    "home_ownership",
    "verification_status",
    "purpose",
    "addr_state",
    "application_type",
]

In [50]:
df = df.drop(columns=CONFIG['features_to_remove'])

In [51]:
df = df[features + ["default"]]
df[cat_features] = df[cat_features].astype(str).fillna('missing')

df_test = df_test[features]
df_test[cat_features] = df_test[cat_features].astype(str).fillna('missing')


In [52]:
X_train = df.drop(columns=["default"])
y_train = df["default"]


In [53]:
X_train

Unnamed: 0,funded_amnt,term,installment,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,...,sec_app_collections_12_mths_ex_med,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,time_of_banking_days,time_of_banking_months
0,5000,36,162.87500,10+ years,RENT,24000.0,Verified,credit_card,AZ,27.656250,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,9830.0,323.0
1,2500,60,59.84375,< 1 year,RENT,30000.0,Source Verified,car,GA,1.000000,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,4627.0,152.0
2,2400,36,84.31250,10+ years,RENT,12252.0,Not Verified,small_business,IL,8.718750,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,3682.0,121.0
3,10000,36,339.25000,10+ years,RENT,49200.0,Source Verified,other,CA,20.000000,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,5782.0,190.0
4,3000,60,67.81250,1 year,RENT,80000.0,Source Verified,other,OR,17.937500,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,5813.0,191.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925487,24000,60,690.50000,< 1 year,RENT,107000.0,Source Verified,other,CO,11.648438,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,8036.0,264.0
2925488,10000,36,313.25000,10+ years,MORTGAGE,65000.0,Source Verified,debt_consolidation,PA,19.546875,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,8613.0,283.0
2925489,10050,36,358.25000,8 years,RENT,37000.0,Not Verified,debt_consolidation,VA,20.562500,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,8736.0,287.0
2925490,6000,36,197.75000,5 years,RENT,41000.0,Source Verified,credit_card,NY,19.984375,...,-1,-1,-1.0,-1,-1,-1.0,-1.0,-1.0,9832.0,323.0


In [54]:
from catboost import CatBoostClassifier


baseline_params = CONFIG['baseline_params']
baseline_params['cat_features'] = cat_features
model = CatBoostClassifier(**baseline_params)

print('Traning the model...')
model.fit(X_train, y_train,)

Traning the model...
0:	learn: 0.6526530	total: 1.13s	remaining: 18m 51s
1:	learn: 0.6165469	total: 1.9s	remaining: 15m 49s
2:	learn: 0.5845296	total: 2.65s	remaining: 14m 39s
3:	learn: 0.5562605	total: 3.6s	remaining: 14m 56s
4:	learn: 0.5309910	total: 4.5s	remaining: 14m 56s
5:	learn: 0.5074704	total: 5.4s	remaining: 14m 54s
6:	learn: 0.4862697	total: 6.1s	remaining: 14m 26s
7:	learn: 0.4668828	total: 7.34s	remaining: 15m 10s
8:	learn: 0.4504073	total: 8.29s	remaining: 15m 12s
9:	learn: 0.4347921	total: 9.22s	remaining: 15m 13s
10:	learn: 0.4217399	total: 10.2s	remaining: 15m 18s
11:	learn: 0.4104133	total: 11.3s	remaining: 15m 29s
12:	learn: 0.3999556	total: 12.2s	remaining: 15m 27s
13:	learn: 0.3904476	total: 13s	remaining: 15m 17s
14:	learn: 0.3816767	total: 14s	remaining: 15m 17s
15:	learn: 0.3741638	total: 14.7s	remaining: 15m 4s
16:	learn: 0.3672930	total: 15.4s	remaining: 14m 49s
17:	learn: 0.3612182	total: 16.3s	remaining: 14m 49s
18:	learn: 0.3552960	total: 17s	remaining: 14

<catboost.core.CatBoostClassifier at 0x326966e10>

In [56]:
#df_test = df_test.drop(columns=CONFIG['features_to_remove'])
X = df_test.copy()

In [57]:
df_test["default_prob"] = model.predict_proba(X)[:, 1]
df_test["default"] = model.predict(X)

In [62]:
df_test.to_parquet("../data/qfriends.parquet")

In [61]:
df_test.head()

Unnamed: 0,funded_amnt,term,installment,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,...,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,time_of_banking_days,time_of_banking_months,default_prob,default
472065,14100,36,552.0,< 1 year,OWN,128000.0,Source Verified,debt_consolidation,NC,14.117188,...,-1.0,-1,-1,-1.0,-1.0,-1.0,4656,153,0.00474,0.0
472066,20000,60,466.0,10+ years,RENT,92000.0,Not Verified,other,NY,11.398438,...,-1.0,-1,-1,-1.0,-1.0,-1.0,5630,185,0.955386,1.0
472067,28625,60,788.0,10+ years,RENT,150000.0,Not Verified,debt_consolidation,NY,7.878906,...,-1.0,-1,-1,-1.0,-1.0,-1.0,8735,287,0.049831,0.0
472068,20000,60,424.25,10+ years,MORTGAGE,69000.0,Source Verified,debt_consolidation,IN,21.953125,...,-1.0,-1,-1,-1.0,-1.0,-1.0,4899,161,0.325084,0.0
472069,15000,60,376.5,4 years,RENT,50000.0,Source Verified,debt_consolidation,CA,12.648438,...,-1.0,-1,-1,-1.0,-1.0,-1.0,10286,338,0.016357,0.0
