In [27]:
import sys

sys.path.append("../libs")


CONFIG = {
    "model_path": "../data/model/model_baseline.joblib",
    "features_path": '../data/model/baseline_features_list.json'

}

## Load Model

In [28]:
import joblib

model = joblib.load(CONFIG["model_path"])

In [29]:
import json

with open(CONFIG["features_path"], 'r') as f:
    features = json.load(f)
features

['funded_amnt',
 'term',
 'installment',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'verification_status',
 'purpose',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'application_type',
 'annual_inc_joint',
 'dti_joint',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths

## Predict

In [30]:
import pandas as pd


df  = pd.read_parquet('../data/lending_club_case_case_test_dataset.parquet')

In [31]:
df['time_of_banking_days'] = (df['issue_d'] - df['earliest_cr_line']).dt.days
df['time_of_banking_months'] = (df['issue_d'].dt.year - df['earliest_cr_line'].dt.year) * 12 + (df['issue_d'].dt.month - df['earliest_cr_line'].dt.month)

In [35]:
cat_features = [
    "emp_length",
    "home_ownership",
    "verification_status",
    "purpose",
    "addr_state",
    "application_type",
]

In [32]:
X = df[features]

In [36]:
X[cat_features] = X[cat_features].astype(str).fillna('missing')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_features] = X[cat_features].astype(str).fillna('missing')


In [40]:
df["default_prob"] = model.predict_proba(X)[:, 1]

In [42]:
df['principal'] = df['loan_amnt'] * (1 + df["int_rate"])


In [43]:
df['ead'] = df['principal'] - df.total_pymnt
df['ead_ratio'] = df['ead'] / df['principal']

# Loss Given Default (LGD): 1 - recoveries ratio
df['lgd_ratio'] = 1 - (df['recoveries'] / df['ead'])

# Final Losses
df['losses'] = (
    df['principal']
    - df.total_pymnt
    + df['recoveries']
)

## Write

In [41]:
df.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,...,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,time_of_banking_days,time_of_banking_months,default_prob
472065,130607974,14100,14100,14096.0,36,0.238647,552.0,E,E2,Sr Clinical Research Associate,...,-1,-1,,-1.0,-1.0,-1.0,N,4656,153,0.012474
472066,130932509,20000,20000,20000.0,60,0.140747,466.0,C,C3,TEACHER,...,-1,-1,,-1.0,-1.0,-1.0,N,5630,185,0.975943
472067,130938452,28625,28625,28624.0,60,0.218506,788.0,D,D5,Psychologist,...,-1,-1,,-1.0,-1.0,-1.0,N,8735,287,0.077849
472068,130950844,20000,20000,20000.0,60,0.099182,424.25,B,B2,Manager,...,-1,-1,,-1.0,-1.0,-1.0,N,4899,161,0.533055
472069,130966126,15000,15000,15000.0,60,0.174683,376.5,D,D1,CCAP Administrator,...,-1,-1,,-1.0,-1.0,-1.0,N,10286,338,0.060284
