In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

import os

#For inline plotting 
%matplotlib inline                 
%config InlineBackend.figure_format = 'svg'

%reload_ext autoreload
%autoreload 2

# set plot properties to seaborn globally
plt.style.use("seaborn-v0_8-white")

### Monitoring/Maintaining PD Model
------

Over time, the population that the model trained on might show differences with the characteristics of new borrowers. In such situations, it is crucial to re-asses the quality of the model, as it can have disastrous consequences. 

For example, we can use the number of new borrowers as benchmark for maintenance time of the model: e.g. redevelop the model after 50-100K new data points!

A widely accepted method to compare two populations is `Population Stability Index (PSI)`. It simply has the aim of answering: 

- "Is the new data too different from the original one we used to train the model?"

In practice, we need to take the new data and transform it into the form we used to build the original PD model, which in fact includes categorizing features we utilized via fine/coarse classing, e.g. using `WoE` based binning via `optbinning` library as we did earlier. 

For any feature with $k$ number of categories we can compute the `PSI` via 

$$
\textrm{PSI} = \sum_{i = 1}^{k} \left(\% \textrm{actual}_j - \% \textrm{expected}_j\right)\, \times \, \ln\left(\frac{\%\textrm{actual}_j}{\% \textrm{expected}_j}\right),
$$

where $\% \textrm{actual}_j$ represent the percentage of occurrence of the jth category in the new data. We can then take guidance from the following table to decide whether we need to redevelop our model

$\textrm{PSI} \in [0,1]$     |  `population difference`
----------------------------- | ------------------------------------------ |
$\textrm{PSI} = 0 $           |   no difference                            |
$\textrm{PSI}  <0.1 $         | little-no difference                       |
$0.1 < \textrm{PSI}<  0.25 $  | little difference (no action)              |
$\textrm{PSI}>  0.25 $        | Big difference (action should be taken)    |
$\textrm{PSI} = 1$.           | Absolute 

In [2]:
# Load the pre-processed training data: Clean, binned and dummified

current_dir = os.getcwd()
raw_data_dir = current_dir + "/raw_data/"
model_data_dir = current_dir + "/model_data/" 

loan_Xt = pd.read_csv(model_data_dir + 'loan_data_train_pp.csv', index_col = 0)
loan_Xval = pd.read_csv(model_data_dir + 'loan_data_val_pp.csv', index_col = 0)

loan_yt = pd.read_csv(model_data_dir + 'loan_data_target.csv', index_col = 0)
loan_yval = pd.read_csv(model_data_dir + 'loan_data_target_val.csv', index_col = 0)

# new data 
loan_new = pd.read_csv(raw_data_dir + 'loan_data_2015.csv', low_memory = False)

In [3]:
loan_df = loan_new.copy()

In [4]:
# we keep the reference bins as they belong to the original variables we actually keep in our model 
# drop the columns we have not used when we build our model 

drop = ['tot_coll_amt_missing','total_acc:10.50-14.50', 'total_acc:14.50-21.50', 'total_acc:21.50-25.50',
        'total_acc:25.50-33.50','total_acc:>33.50', 'total_acc:<10.50', 'emp_length:1-9', 'emp_length:10', 'emp_length:0',
        'inqlast6mths_x_dti:<8.99', 'inqlast6mths_x_dti:8.99-17.04', 'inqlast6mths_x_dti:17.04-45.13',
        'inqlast6mths_x_dti:>45.13']

loan_Xt = loan_Xt.drop(drop, axis = 1)
loan_Xval = loan_Xval.drop(drop, axis = 1)

In [5]:
# get the features we used in the PD model: need this to slice the new data

model_features = []
for col in loan_Xt.columns:
    
    model_features.append(col.split(':')[0])
    
model_features = set(model_features)

model_features

{'addr_state',
 'annual_inc',
 'dti',
 'home_ownership',
 'initial_list_status',
 'inq_last_6mths',
 'loan_to_income',
 'mths_since_earliest_cr_line',
 'purpose',
 'revol_util',
 'term_in_months',
 'tot_cur_bal',
 'tot_cur_bal_missing',
 'total_rev_hi_lim',
 'verification_status'}

To be able to compare the new data with the training data we used for the PD model, we need to pre-process it so that it is identical to the latter. Below, I perform these steps using the scripts I utilized before!

In [6]:
def parse_emp_length(val):
    # Check if the value is the string 'nan' (case insensitive) or other invalid string
    if isinstance(val, str) and val.strip().lower() == 'nan':
        return 0
    # Check for '< 1 years' or other similar cases
    if '<' in str(val):
        return 0
    # Extract digits and convert to integer
    digits = ''.join(filter(str.isdigit, str(val)))
    return int(digits) if digits else 0         # Convert to float (to handle "< 1")

In [7]:
loan_df['term_in_months'] = loan_df.term.apply(parse_emp_length)

loan_df.term_in_months.unique()

array([60, 36])

In [8]:
loan_df = loan_df.drop(columns=['term'])

In [9]:
loan_df['earliest_cr_line_dt'] = pd.to_datetime(loan_df.earliest_cr_line, format = "%b-%y")

In [10]:
loan_df.earliest_cr_line_dt.max()

Timestamp('2068-12-01 00:00:00')

In [11]:
years_to_loop = [f'201{i}' for i in range(5)]

for year in years_to_loop:
    
    print(year, len(loan_df.earliest_cr_line_dt[loan_df.earliest_cr_line_dt == year]))       

2010 396
2011 373
2012 307
2013 0
2014 0


In [12]:
loan_df.earliest_cr_line_dt = loan_df.earliest_cr_line_dt.apply(lambda d: d.replace(year=d.year - 100) if d.year > 2012 else d)

In [13]:
loan_df.earliest_cr_line_dt.max(), loan_df.earliest_cr_line_dt.min()

(Timestamp('2012-11-01 00:00:00'), Timestamp('1944-01-01 00:00:00'))

In [14]:
# drop earliest_cr_line

loan_df = loan_df.drop(columns='earliest_cr_line')

In [15]:
loan_df['issue_d_dt'] = pd.to_datetime(loan_df.issue_d, format = "%b-%y")

loan_df.issue_d_dt.max(), loan_df.issue_d_dt.min()

(Timestamp('2015-12-01 00:00:00'), Timestamp('2015-01-01 00:00:00'))

In [16]:
# drop the original 

loan_df = loan_df.drop(columns = 'issue_d')

In [17]:
# although no missing val we should add missing flag variable again 

loan_df.tot_cur_bal.isna().sum()

0

In [18]:
# missing flag variable

loan_df['tot_cur_bal_missing'] = loan_df['tot_cur_bal'].isnull().astype(int)

In [19]:
# parse object type dates as datetime
loan_df.loc[:,'earliest_cr_line_dt'] = pd.to_datetime(loan_df.earliest_cr_line_dt)
loan_df.loc[:,'issue_d_dt'] = pd.to_datetime(loan_df.issue_d_dt)

loan_df['mths_since_earliest_cr_line'] = round((loan_df.issue_d_dt - loan_df.earliest_cr_line_dt) / np.timedelta64(30, 'D'))

In [20]:
# drop the original dates
loan_df = loan_df.drop(columns=['earliest_cr_line_dt', 'issue_d_dt'])

In [21]:
# loan_to_income variable 
loan_df['loan_to_income'] = loan_df['loan_amnt'] / loan_df['annual_inc']

In [22]:
loan_df.loan_status.unique()

array(['Issued', 'Current', 'Fully Paid', 'In Grace Period',
       'Late (16-30 days)', 'Late (31-120 days)', 'Charged Off',
       'Default'], dtype=object)

In [23]:
# targets 

# We assing good loans 1, and defaulted ones 0 

loan_df['loan_status'] = np.where(loan_df.loan_status.isin(['Charged Off','Late (31-120 days)', 
                                                                             'Default']),0,1)

(loan_df.loan_status.value_counts() / loan_df.loan_status.count())

loan_status
1    0.981356
0    0.018644
Name: count, dtype: float64

In [24]:
# save the targets 

loan_df_targets = loan_df.loan_status

In [25]:
model_features = list(model_features)

loan_df = loan_df[model_features]

In [26]:
loan_df.isna().sum()

purpose                          0
verification_status              0
addr_state                       0
annual_inc                       0
home_ownership                   0
tot_cur_bal                      0
revol_util                     162
total_rev_hi_lim                 0
term_in_months                   0
initial_list_status              0
loan_to_income                   0
dti                              0
inq_last_6mths                   0
tot_cur_bal_missing              0
mths_since_earliest_cr_line      0
dtype: int64

In [27]:
# fill revol_util 

loan_df.loc[:,'revol_util'] = loan_df.revol_util.fillna(loan_df.revol_util.median())

After we are done with the pre-processing of the variables we used in the PD model, we are ready to bin and dummify them as before. 

In [28]:
reference_bins = pd.read_csv(model_data_dir + 'reference_bins.csv', index_col = 0)

In [29]:
reference_bins

Unnamed: 0,Variable,Opt_Ref_Bin
0,home_ownership,['OTHER' 'NONE' 'RENT']
1,verification_status,['Verified']
2,purpose,['small_business' 'educational' 'moving' 'house' 'other'\n 'renewable_energy' 'medical']
3,addr_state,['NE' 'IA' 'NV' 'FL' 'HI' 'AL']
4,initial_list_status,['f']
5,inq_last_6mths,"[2.50, inf)"
6,emp_length_years,"(-inf, 0.50)"
7,term_in_months,"[48.00, inf)"
8,total_rev_hi_lim_missing,"[0.50, inf)"
9,tot_coll_amt_missing,"[0.50, inf)"


There are less number of categories from some variables as compared to the training set, for example for `home_ownership` variable: In such cases we will use the available categories in the data to fill our buckets.

In [30]:
loan_df['home_ownership:OTHER-NONE-RENT'] = np.where(loan_df.home_ownership.isin(['RENT']),True,False)
loan_df['home_ownership:MORTGAGE-ANY'] = np.where(loan_df.home_ownership.isin(['MORTGAGE','ANY']),True,False)
loan_df['home_ownership:OWN'] = np.where(loan_df.home_ownership.isin(['OWN']),True,False)

loan_df = loan_df.drop('home_ownership', axis = 1)


In [31]:
loan_df.addr_state.unique()

array(['DC', 'IN', 'KS', 'IL', 'OR', 'VA', 'PA', 'GA', 'TN', 'OH', 'NY',
       'FL', 'NJ', 'RI', 'NC', 'AL', 'TX', 'SC', 'MD', 'WA', 'MO', 'MN',
       'NH', 'VT', 'AZ', 'MI', 'MA', 'CA', 'LA', 'DE', 'NM', 'CO', 'WI',
       'SD', 'CT', 'NE', 'HI', 'AR', 'MT', 'WV', 'WY', 'OK', 'NV', 'KY',
       'MS', 'ME', 'UT', 'ND', 'AK'], dtype=object)

In [32]:
expected_states_bins = ['addr_state:AR-MI-PA-OH',
       'addr_state:AZ-NJ', 'addr_state:CA-UT-KY', 'addr_state:GA-WA-OR-WI',
       'addr_state:IL-CT', 'addr_state:KS-SC-CO-VT-AK-MS-WV-NH-WY-DC-ME-ID',
       'addr_state:LA-MD-NC', 'addr_state:MN-RI-MA-DE-SD-IN',
       'addr_state:MT-TX', 'addr_state:NE-IA-NV-FL-HI-AL',
       'addr_state:NM-VA-NY-OK-TN-MO']

In [33]:
expected_states_pre = [item.split(":")[1].split("-") for item in expected_states_bins]
expected_states = [state for sub_list in expected_states_pre for state in sub_list]

In [34]:
diff = list(set(expected_states).symmetric_difference(set(loan_df.addr_state.unique().tolist())))

diff

['ND', 'ID', 'IA']

Again, we are missing `IA` (Iowa), `ID`(Idaho) in the new data but have `ND` (North Dakota) in the new

In [35]:
dummies_state_df = pd.get_dummies(loan_df.addr_state, prefix = 'addr_state', prefix_sep=':')

In [36]:
dummies_state_df['addr_state:IA'] = False
dummies_state_df['addr_state:ID'] = False

In [37]:
loan_df['addr_state:NE-IA-NV-FL-HI-AL'] = sum([dummies_state_df['addr_state:NE'], dummies_state_df['addr_state:IA'], 
                                              dummies_state_df['addr_state:NV'],dummies_state_df['addr_state:FL'],
                                              dummies_state_df['addr_state:HI'],dummies_state_df['addr_state:AL']
                                              ,dummies_state_df['addr_state:ND']])
test_bin = 'addr_state:KS-SC-CO-VT-AK-MS-WV-NH-WY-DC-ME-ID'
loan_df[test_bin] = sum([dummies_state_df[f"addr_state:{state}"] for state in test_bin.split(":")[1].split("-")])

In [38]:
rest_of_the_states_bins = ['addr_state:AR-MI-PA-OH','addr_state:AZ-NJ', 'addr_state:CA-UT-KY', 
                           'addr_state:GA-WA-OR-WI', 'addr_state:IL-CT','addr_state:LA-MD-NC', 
                           'addr_state:MN-RI-MA-DE-SD-IN','addr_state:MT-TX','addr_state:NM-VA-NY-OK-TN-MO']

for bin in rest_of_the_states_bins:
    
    loan_df[bin] = sum([dummies_state_df[f"addr_state:{state}"] for state in bin.split(":")[1].split("-")])

In [39]:
# drop the original 

loan_df = loan_df.drop('addr_state', axis = 1)

In [40]:
other_qual_categoricals = ['verification_status', 'initial_list_status', 'purpose']
for it in loan_Xt.columns.values.tolist():
    
    if it.split(":")[0] in other_qual_categoricals[0]:
        
        loan_df[it] = np.where(loan_df[it.split(":")[0]] == " ".join(it.split(":")[1].split("-")), True, False)
    
    elif it.split(":")[0] in other_qual_categoricals[1:]:
        
        loan_df[it] = np.where(loan_df[it.split(":")[0]].isin(it.split(":")[1].split('-')), True, False)

In [41]:
# drop the originals 

loan_df = loan_df.drop(other_qual_categoricals, axis = 1)  

In [42]:
to_remove = other_qual_categoricals + ['addr_state','home_ownership']

numerical_features = [feat for feat in model_features if feat not in to_remove]

loan_df[numerical_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421094 entries, 0 to 421093
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   annual_inc                   421094 non-null  float64
 1   tot_cur_bal                  421094 non-null  float64
 2   revol_util                   421094 non-null  float64
 3   total_rev_hi_lim             421094 non-null  float64
 4   term_in_months               421094 non-null  int64  
 5   loan_to_income               421094 non-null  float64
 6   dti                          421094 non-null  float64
 7   inq_last_6mths               421094 non-null  float64
 8   tot_cur_bal_missing          421094 non-null  int64  
 9   mths_since_earliest_cr_line  421094 non-null  float64
dtypes: float64(8), int64(2)
memory usage: 32.1 MB


We continue with the variables that takes numerical values.

In [43]:
dummies_term_df = pd.get_dummies(loan_df['term_in_months'], prefix = 'term_in_months', prefix_sep=':')

loan_df = pd.concat([loan_df, dummies_term_df], axis = 1)

loan_df = loan_df.drop('term_in_months', axis = 1)

In [44]:
loan_df['inq_last_6mths:0'] = np.where(loan_df.inq_last_6mths == 0, True, False)
loan_df['inq_last_6mths:1'] = np.where(loan_df.inq_last_6mths == 1., True, False)
loan_df['inq_last_6mths:2'] = np.where(loan_df.inq_last_6mths == 2., True, False)

loan_df['inq_last_6mths:3-33'] = np.where(loan_df.inq_last_6mths >= 3., True, False)


loan_df = loan_df.drop('inq_last_6mths', axis = 1)


In [45]:
continuous_features = [feat for feat in numerical_features if feat not in ['inq_last_6mths', 'term_in_months',
                                                                           'tot_cur_bal_missing']]
expected_cont_feat = []

for it in loan_Xt.columns.values.tolist():
    
    if it.split(":")[0] in continuous_features:
        
        expected_cont_feat.append(it)       

expected_cont_feat.sort()        

In [46]:
for feat in expected_cont_feat:
    
    boundaries = feat.split(':')[1].split('-')
    
    if len(boundaries) == 2:
        
        loan_df[feat] = (np.where((loan_df[feat.split(':')[0]] >= float(boundaries[0]))
                                 & (loan_df[feat.split(':')[0]] < float(boundaries[1])), True, False))
    else: 
        
        if boundaries[0].startswith('<'):
            
            lower_bound = boundaries[0].split('<')[1]
            loan_df[feat] = np.where(loan_df[feat.split(":")[0]] < float(lower_bound), True, False)
        
        elif boundaries[0].startswith('>'): 
            
            upper_bound = boundaries[0].split('>')[1]
            loan_df[feat] = np.where(loan_df[feat.split(":")[0]] >= float(upper_bound), True, False) 

In [47]:
# drop originals 

loan_df = loan_df.drop(continuous_features, axis = 1)

Finally, we are done with pre-processing the new data exactly in the same way we have done for the training set!

In [48]:
# rename 
loan_df_new_inputs = loan_df 
loan_df_new_targets = loan_df_targets

In [49]:
loan_df_new_targets.shape, loan_df_new_inputs.shape 

((421094,), (421094, 67))

To compare fully the training set and the new data, we are not only interested in calculating the PSI for the independent variables we used in the PD model but also its outcome: Credit Score! If the distribution of the credit score is changed in the new_data, it also signals that we might want to revise our model! In practice, this means that we need to score the training and the new data. For this purpose, we can use the score-card we developed earlier which we used to score the validation dataset.  

In [50]:
# import the score card 
score_card = pd.read_csv(model_data_dir + 'score_card.csv')

# add a column for intercept to both train and the new data 

loan_Xt.insert(0,'const', 1)

loan_df_new_inputs.insert(0, 'const', 1)

In [51]:
score_card

Unnamed: 0,variable_name,coefficient,score
0,const,-1.672207,327.0
1,addr_state:AR-MI-PA-OH,0.110138,18.0
2,addr_state:AZ-NJ,0.043827,7.0
3,addr_state:CA-UT-KY,0.044299,7.0
4,addr_state:GA-WA-OR-WI,0.133024,22.0
5,addr_state:IL-CT,0.176802,29.0
6,addr_state:KS-SC-CO-VT-AK-MS-WV-NH-WY-DC-ME-ID,0.248863,41.0
7,addr_state:LA-MD-NC,0.035,6.0
8,addr_state:MN-RI-MA-DE-SD-IN,0.079405,13.0
9,addr_state:MT-TX,0.144197,24.0


In [None]:
# order the columns as appear in the score card 

loan_df_new_inputs = loan_df_new_inputs[score_card.variable_name.values]
loan_Xt = loan_Xt[score_card.variable_name.values]

In [57]:
card_scores = score_card.score.values.reshape(-1,1)

train_scores = loan_Xt.dot(card_scores)
new_data_scores = loan_df_new_inputs.dot(card_scores)

train_scores.columns = ['score']
new_data_scores.columns = ['score']

# concat the scores with input dfs
loan_df_new_inputs = pd.concat([loan_df_new_inputs, new_data_scores], axis =1 )
loan_Xt = pd.concat([loan_Xt, train_scores], axis = 1)


Next we need to dummify the score columns we just created in order to be able to use them in the PSI computation. Since the range of scores vary between 300 to 850, we can use bins of size 50! 

In [64]:
np.arange(300.,850.,50.)

array([300., 350., 400., 450., 500., 550., 600., 650., 700., 750., 800.])

In [72]:
for i in np.arange(300, 850, 50):
    
  loan_df_new_inputs[f"score:{i}-{i + 50}"] = (np.where((loan_df_new_inputs['score'] >= float(f"{i}"))
                                                        & (loan_df_new_inputs['score'] < float(f"{i+50}")
                                                           ), True, False))
  loan_Xt[f"score:{i}-{i + 50}"] = (np.where((loan_Xt['score'] >= float(f"{i}"))
                                             & (loan_Xt['score'] < float(f"{i+50}")), True, False))

In [74]:
# drop the score 

loan_df_new_inputs = loan_df_new_inputs.drop('score', axis =1)

loan_Xt = loan_Xt.drop('score', axis = 1)

In [90]:
# begin building PSI calculations 

psi_train = loan_Xt.sum() / loan_Xt.shape[0]
psi_new_data = loan_df_new_inputs.sum() / loan_df_new_inputs.shape[0]


In [105]:
psi_df = pd.concat([psi_train, psi_new_data], axis = 1)

psi_df = psi_df.reset_index()

psi_df.columns = ['variable_name', 'train_pct', 'new_data_pct']

In [106]:
psi_df['original_var_name'] = psi_df['variable_name'].str.split(":").str[0]

In [107]:
# we dont need the intercept for psi calc 

psi_df = psi_df[psi_df.variable_name != 'const']

In [108]:
# re-order the columns 

columns = [col for col in psi_df.columns]

psi_df = psi_df[[columns[0], columns[3], columns[1], columns[2]]]

In [113]:
psi_df['psi'] = (np.where((psi_df.new_data_pct == 0.) | (psi_df.train_pct == 0.), 0,
                          (psi_df.new_data_pct - psi_df.train_pct) * np.log(psi_df.new_data_pct / psi_df.train_pct)))

psi_df

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,variable_name,original_var_name,train_pct,new_data_pct,psi
1,addr_state:AR-MI-PA-OH,addr_state,0.09966,0.104373,0.0002177983
2,addr_state:AZ-NJ,addr_state,0.061738,0.05912,0.0001134543
3,addr_state:CA-UT-KY,addr_state,0.170341,0.154395,0.001567229
4,addr_state:GA-WA-OR-WI,addr_state,0.080117,0.079873,7.467356e-07
5,addr_state:IL-CT,addr_state,0.055245,0.055073,5.361432e-07
6,addr_state:KS-SC-CO-VT-AK-MS-WV-NH-WY-DC-ME-ID,addr_state,0.064759,0.068303,0.0001888089
7,addr_state:LA-MD-NC,addr_state,0.062553,0.064577,6.445792e-05
8,addr_state:MN-RI-MA-DE-SD-IN,addr_state,0.064622,0.067683,0.0001416377
9,addr_state:MT-TX,addr_state,0.081123,0.085162,0.0001962426
10,addr_state:NE-IA-NV-FL-HI-AL,addr_state,0.099537,0.105186,0.0003118056


In [112]:
# psi by features 

tot_psi_df = psi_df.groupby('original_var_name')[['psi']].sum()

tot_psi_df

Unnamed: 0_level_0,psi
original_var_name,Unnamed: 1_level_1
addr_state,0.002906
annual_inc,0.004966
dti,0.051554
home_ownership,0.004274
initial_list_status,0.333717
inq_last_6mths,0.054774
loan_to_income,0.002109
mths_since_earliest_cr_line,0.017067
purpose,0.004016
revol_util,0.012541


Major change that appear in the `initial_list_status` may not directly reflect a change in the borrowers characteristics but rather a change in the bank's behavior. `score` does not seem to have too large of a `PSI` to take an action at the moment!