In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

### 5. Prepare Testing data

In [21]:
df_test = pd.read_csv('testing_loan_data.csv',
                      converters={'id': str, 
                                  'member_id': str})    
df_test.head()

  df_test = pd.read_csv('testing_loan_data.csv',


Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,36 months,22.15%,8 years,RENT,37000.0,,debt_consolidation,...,1,3.0,73.10%,16200,,14877.17028,36809,1,131,
1,20000002,22349118,1400,36 months,18.24%,6 years,RENT,41000.0,,other,...,0,9.0,11.50%,4000,,4097.30477,19536,1,19,
2,20000003,22398818,7000,36 months,12.49%,3 years,RENT,68900.0,,debt_consolidation,...,0,11.0,48.10%,11900,80.0,12688.49516,241465,1,92,
3,20000004,22419015,18000,60 months,16.29%,9 years,MORTGAGE,41000.0,,debt_consolidation,...,1,0.0,38.10%,7600,73.0,7908.799817,179757,1,235,
4,20000005,22388614,12000,36 months,12.99%,10+ years,MORTGAGE,64000.0,,home_improvement,...,0,,57.90%,21000,,19378.56106,31953,1,157,


In [22]:
df_test.describe()

Unnamed: 0,loan_amnt,annual_inc,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
count,102505.0,102505.0,101459.0,101463.0,102505.0,102505.0,93677.0,102505.0,29146.0,102505.0,102505.0,102505.0,102505.0,0.0
mean,14855.131457,75189.59,50.648124,64.598112,17.437458,0.848963,7.049062,19300.244573,43.248199,19305.304664,139037.5,1.0,194.151651,
std,8390.030386,55024.54,34.493123,26.130614,7.54479,1.093399,5.872614,19265.526038,22.223284,19332.611174,149940.8,0.0,109.059728,
min,1000.0,3000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,14.0,
25%,8400.0,46000.0,25.0,46.1,11.77,0.0,2.0,6800.0,26.0,6737.412077,28635.0,1.0,110.0,
50%,13000.0,65000.0,50.0,68.9,17.16,1.0,6.0,13300.0,43.0,13329.72295,83006.0,1.0,170.0,
75%,20000.0,90000.0,80.0,87.0,23.0,1.0,11.0,25200.0,59.0,25175.52937,209941.0,1.0,261.0,
max,35000.0,7446395.0,100.0,183.6,34.99,6.0,25.0,560800.0,188.0,553280.9974,3370799.0,1.0,456.0,


In [23]:
# fix "term"
df_test['term_nb'] = np.where(df_test['term'].notnull() & ~df_test['term'].isna(), 
                         pd.to_numeric(df_test['term'].str[0:3], errors='coerce'), 
                         np.nan
                        )

# fix "int_rate", "revol_util"
df_test['int_rate_nb'] = df_test['int_rate'].str.rstrip('%').astype('float') / 100.0
df_test['revol_util_nb'] = df_test['revol_util'].str.rstrip('%').astype('float') / 100.0
df_test['int_rate_nb'].dtype, df_test['int_rate_nb'][0: 5], df_test['revol_util_nb'].dtype, df_test['revol_util_nb'][0: 5]

# Fix utilization rate
df_test['bc_util_fixed'] = np.where(df_test['bc_util'] > 100, 100, df_test['bc_util'])
df_test['revol_util_nb_fixed'] = np.where(df_test['revol_util_nb'] > 1, 100, df_test['bc_util'] * 100)

# Fill Missing values in "emp_length" with 'NONE' as a string
df_test['emp_length'] = df_test['emp_length'].fillna('NONE')
# Fill missing values of numeric variables with their corresponding mean
c_lst = ['percent_bc_gt_75', 'bc_util_fixed', 'mths_since_recent_inq', 'total_bc_limit', 'mths_since_last_major_derog', 'tot_hi_cred_lim', 'tot_cur_bal', 'revol_util_nb_fixed']
## create a new dataframe to house the cleaned data
carry_cols = [c for c in df_test.columns if (c not in c_lst) & (c not in ['bc_util', 'revol_util_nb'])]
df_test_clean = df_test[carry_cols].copy()

df_dedup_2 = pd.read_csv('df_dedup_2.csv', converters={'id': str, 
                                                         'member_id': str})    
for c in c_lst:
    print(f"Missing values in '{c}' is imputed by its mean.")
    df_test_clean[c] = df_test[c].fillna(df_test[c].mean())

# Encode categorical variables
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

categorical_vars = ['emp_length', 'home_ownership', 'purpose']
# Encode the categorical variable
for v in categorical_vars:
    print(f"--- Encoding {v} ---")
    df_test_clean[v + '_encoded'] = label_encoder.fit_transform(df_test_clean[v])
    print(df_test_clean[v + '_encoded'].value_counts())

# create interactive variables
df_test_clean['bc_util_fixed_X_revol_util_nb_fixed'] = df_test_clean['bc_util_fixed'] * df_test_clean['revol_util_nb_fixed']

Missing values in 'percent_bc_gt_75' is imputed by its mean.
Missing values in 'bc_util_fixed' is imputed by its mean.
Missing values in 'mths_since_recent_inq' is imputed by its mean.
Missing values in 'total_bc_limit' is imputed by its mean.
Missing values in 'mths_since_last_major_derog' is imputed by its mean.
Missing values in 'tot_hi_cred_lim' is imputed by its mean.
Missing values in 'tot_cur_bal' is imputed by its mean.
Missing values in 'revol_util_nb_fixed' is imputed by its mean.
--- Encoding emp_length ---
emp_length_encoded
1     34896
2      8686
3      7912
10     7435
0      6102
5      6050
7      5899
4      5613
6      5411
11     5321
8      5109
9      4071
Name: count, dtype: int64
--- Encoding home_ownership ---
home_ownership_encoded
0    53264
2    39530
1     9711
Name: count, dtype: int64
--- Encoding purpose ---
purpose_encoded
2     60626
1     25694
3      6032
8      4208
5      1664
10     1094
6      1052
0       783
11      504
7       477
4       329


In [24]:
var = ['loan_amnt', 'annual_inc', 'dti', 'term_nb', 'percent_bc_gt_75', 'mths_since_recent_inq', 'total_bc_limit', 'tot_hi_cred_lim', 'tot_cur_bal', 'revol_util_nb_fixed', 'purpose_encoded', 'bc_util_fixed_X_revol_util_nb_fixed']
df_test_ready = df_test_clean[var].copy()

In [25]:
# convert df to tensor
df_test_ready_tensor = torch.from_numpy(df_test_ready.values).type(torch.float32)

### 5. Load Model

In [31]:

device = "cuda" if torch.cuda.is_available() else "cpu"

in_f = df_test_ready.shape[1]

class Net(nn.Module):
    def __init__(self, in_f):
        super().__init__()
        self.layer_1 = nn.Linear(in_features=in_f, out_features=100)
        self.layer_2 = nn.Linear(in_features=100, out_features=1)
        # self.layer_3 = nn.Linear(in_features=200, out_features=1)
        # self.layer_4 = nn.Linear(in_features=400, out_features=1)
        self.relu = nn.ReLU() # <- add in ReLU activation function
        # Can also put sigmoid in the model 
        # This would mean you don't need to use it on the predictions
        # self.sigmoid = nn.Sigmoid()

    def forward(self, x):
      # Intersperse the ReLU activation function between layers
       return self.relu(self.layer_2(self.relu(self.layer_1(x))))

LoanModel = Net(in_f).to(device)


LoanModel.load_state_dict(torch.load('trained_loan_model_v0122.pth'))


  LoanModel.load_state_dict(torch.load('trained_loan_model_v0122.pth'))


<All keys matched successfully>

In [32]:
# Make inference
LoanModel.eval()
with torch.inference_mode():
    test_logits = LoanModel(df_test_ready_tensor).squeeze() 
    test_pred = torch.round(torch.sigmoid(test_logits))

In [33]:
test_pred.sum()

tensor(56334.)

In [34]:
df_test_pred = pd.DataFrame(test_pred.unsqueeze(1).numpy())

In [35]:
df_test_pred.to_csv('test_result_turnin.csv', index=True)