In [368]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import catboost as cb
from catboost import CatBoostClassifier, Pool
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
import pandas as pd
import numpy as np
import sklearn 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import scipy.sparse as sp
import torch.optim as optim
from sklearn.metrics import roc_curve, auc
import seaborn as sns
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import BaggingClassifier
import shap
%matplotlib inline

In [369]:
df1 = pd.read_csv('Lending_Club_modelling_data_part1.txt', delimiter='\t')
df2 = pd.read_csv('Lending_Club_modelling_data_part2.txt', delimiter='\t', dtype={'column_name': str}, low_memory=False)
df3 = pd.read_csv('Lending_Club_modelling_data_part3.txt', delimiter='\t')
df4 = pd.read_csv('Lending_Club_modelling_data_part4.txt', delimiter='\t')

#merged data
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
print("Before dropping na: ",df.shape)

# Drop the title column because it is the same information as purpose
df = df.drop('title', axis=1)

# Fill missing values in 'address' column with the wrong address in mort_acc column
df['address'] = df['address'].fillna( '76093 Nicole Parks\r\nEast Donaldfurt, IN 70466')

# Drop application_type, pub_rec, address, initial_list_status, pub_rec_bankruptcies columns since they are useless after checking catboost feature importance
df = df.drop(['application_type', 'pub_rec', 'address', 'initial_list_status', 'pub_rec_bankruptcies'], axis=1)

# Replace missing values in 'emp_title' column with 'Unknown'
df['emp_title'] = df['emp_title'].fillna('Unknown')

#print rows with missing values, show only if different from 0
print(df.isnull().sum()[df.isnull().sum() > 0])

#replace target variable values with 0 and 1
df['loan_status'] = df['loan_status'].replace({'Charged Off': 1, 'Fully Paid': 0})

# Convert 'issue_d' column to datetime format
df['issue_d'] = pd.to_datetime(df['issue_d'], format='%b-%Y')

# Replace values in 'emp_length' column with numerical values
emp_length_map = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, '5 years': 5,
    '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9, '10+ years': 10
}
df['emp_length'] = df['emp_length'].replace(emp_length_map)

# Correct types of columns
df['revol_util'] = df['revol_util'].replace({'f':'NaN'})
df['revol_util'] = df['revol_util'].astype(float)
df['total_acc'] = df['revol_util'].replace({'INDIVIDUAL':'NaN'})
df['total_acc'] = df['total_acc'].astype(float)
df['mort_acc'] = df['mort_acc'].replace({'76093 Nicole Parks\r\nEast Donaldfurt, IN 70466':'NaN'})
df['mort_acc'] = df['mort_acc'].astype(float)

Before dropping na:  (396030, 25)
emp_length    18301
revol_util      276
mort_acc      37795
dtype: int64


# Net Profit function to be used to compute the loss
# Problem: All my values are scaled

In [370]:
def net_profit_for_custom_loss(y_pred, batch):
    # Unpack the batch into input (X) and labels (y)
    X, y = batch

    # Convert X and y to float tensors (if necessary)
    X = X.float()
    y = y.float()

    # Extract relevant columns from the input (X) tensor
    loan_amnt = X[:, 0]   # Assuming the first column is loan amount
    int_rate = X[:, 2]    # Assuming the third column is interest rate
    loan_status = y       # Assuming y represents loan status (0 or 1)

    # Who received loans with the model (assuming y_pred is a tensor)
    index = (y_pred == 0)

    # Who paid back the loans (loan_status == 0)
    index_payback = (loan_status == 0)

    # Who defaulted on the loans (loan_status == 1)
    index_default = (loan_status == 1)

    # Calculate profit for loans that were paid back
    paid_back_loans = index & index_payback
    profit_paid_back = loan_amnt[paid_back_loans] * (int_rate[paid_back_loans] / 100)
    total_profit = torch.sum(profit_paid_back)

    # Calculate loss for loans that defaulted
    defaulted_loans = index & index_default
    loss_defaulted = loan_amnt[defaulted_loans] * 0.5
    total_loss = torch.sum(loss_defaulted)

    # Calculate net profit (custom loss)
    net_profit = total_profit - total_loss

    return net_profit

# Net Profit function

In [371]:
# Define the net_profit function 
def net_profit(y_pred, df):
    # Extract relevant columns from DataFrame as NumPy arrays
    loan_amnt = df['loan_amnt'].values
    int_rate = df['int_rate'].values
    loan_status = df['loan_status'].values
    
    # Who received loans with the model
    index = (y_pred == 0)
    # Who paid back the loans
    index_payback = (loan_status == 0)
    # Who defaulted on the loans
    index_default = (loan_status == 1)
    
    # Calculate profit for loans that were paid back
    paid_back_loans = index & index_payback
    profit_paid_back = loan_amnt[paid_back_loans] * (int_rate[paid_back_loans] / 100)
    total_profit = np.sum(profit_paid_back)
    
    # Calculate loss for loans that defaulted
    defaulted_loans = index & index_default
    loss_defaulted = loan_amnt[defaulted_loans] * 0.5
    total_loss = np.sum(loss_defaulted)
    
    return total_profit - total_loss

# second net profit function with installments

In [372]:
# Define the net_profit function 
def net_profit2(y_pred, df):
    # Extract relevant columns from DataFrame as NumPy arrays
    loan_amnt = df['loan_amnt'].values
    installment = df['installment'].values
    term = df['term'].replace({-1: 36, 1: 60}).values
    loan_status = df['loan_status'].values
    
    # Who received loans with the model
    index = (y_pred == 0)
    # Who paid back the loans
    index_payback = (loan_status == 0)
    # Who defaulted on the loans
    index_default = (loan_status == 1)
    
    # Calculate profit for loans that were paid back
    paid_back_loans = index & index_payback
    profit_paid_back = installment[paid_back_loans]*term[paid_back_loans]-loan_amnt[paid_back_loans]
    total_profit = np.sum(profit_paid_back)
    
    # Calculate loss for loans that defaulted
    defaulted_loans = index & index_default
    loss_defaulted = loan_amnt[defaulted_loans] * 0.5 #paid back %50 of the loan before defalting (assumption)
    total_loss = np.sum(loss_defaulted)
    
    return total_profit - total_loss
    #return total_profit,total_loss,total_profit - total_loss

# Custom Loss
I dont use it


In [373]:
# Define a custom loss function based on net_profit
class CustomNetProfitLoss(nn.Module):
    def __init__(self):
        super(CustomNetProfitLoss, self).__init__()

    def net_profit_for_custom_loss(self, y_pred, batch):
        # Unpack the batch into input (X) and labels (y)
        X, y = batch

        # Convert X and y to float tensors (if necessary)
        X = X.float()
        y = y.float()

        # Extract relevant columns from the input (X) tensor
        loan_amnt = X[:, 0]   # Assuming the first column is loan amount
        int_rate = X[:, 2]    # Assuming the third column is interest rate
        loan_status = y       # Assuming y represents loan status (0 or 1)

        # Who received loans with the model (assuming y_pred is a tensor)
        index = (y_pred == 0)

        # Who paid back the loans (loan_status == 0)
        index_payback = (loan_status == 0)

        # Who defaulted on the loans (loan_status == 1)
        index_default = (loan_status == 1)

        # Calculate profit for loans that were paid back
        paid_back_loans = index & index_payback
        profit_paid_back = loan_amnt[paid_back_loans] * (int_rate[paid_back_loans] / 100)
        total_profit = torch.sum(profit_paid_back)

        # Calculate loss for loans that defaulted
        defaulted_loans = index & index_default
        loss_defaulted = loan_amnt[defaulted_loans] * 0.5
        total_loss = torch.sum(loss_defaulted)

        # Calculate net profit (custom loss)
        net_profit = total_profit - total_loss

        return net_profit

    def forward(self, y_pred, batch):
        # Compute net profit based on predicted labels and loan information
        net_profit = self.net_profit_for_custom_loss(y_pred, batch)
        
        # Return negative net profit as loss (minimize loss to maximize net profit)
        return -net_profit

# Preprocessing

In [374]:
#FOR TERM (36-60 MONTHS) WE ENCODE THEM AS 0-1 AND INCLUDE THEM IN THE NUMERICAL COLUMNS
df['term'] = df['term'].str.replace(' months', '').astype(int)
df['term'] = df['term'].replace({ 36: -1, 60: 1})

#For verification status we use one hot encoding
df = pd.get_dummies(df, columns=['verification_status'], sparse=True)
df['verification_status_Not Verified'] = df['verification_status_Not Verified'].astype(int)
#df['verification_status_Source Verified'] = df['verification_status_Source Verified'].astype(int) 
df['verification_status_Verified'] = df['verification_status_Verified'].astype(int)
df.drop('verification_status_Source Verified', axis=1, inplace=True) #drop this column as it is redundant (control)

#For home ownership we use one hot encoding
df = pd.get_dummies(df, columns=['home_ownership'], sparse=True)
df['home_ownership_MORTGAGE'] = df['home_ownership_MORTGAGE'].astype(int)
df['home_ownership_OWN'] = df['home_ownership_OWN'].astype(int)
df['home_ownership_RENT'] = df['home_ownership_RENT'].astype(int)
df.drop('home_ownership_ANY', axis=1, inplace=True) #drop this column for control
df.drop('home_ownership_NONE', axis=1, inplace=True) #drop this column for control
df.drop('home_ownership_OTHER', axis=1, inplace=True) #drop this column for control

Since this is a neural network model, we drop the categorical columns in the beginning. 

In [375]:
categorical_columns = df.select_dtypes(include='object').columns.tolist()
df.drop(categorical_columns, axis=1, inplace=True)


# -----------------------------------------------------

# Feature Engineering

In [376]:
df_original=df.copy()
# Separate the data into three groups based on the year ranges
original_training_set = df_original[df_original['issue_d'].dt.year < 2015]
original_validation_set = df_original[df_original['issue_d'].dt.year == 2015]
original_test_set = df_original[df_original['issue_d'].dt.year == 2016]

# Optionally, you may want to reset the index for each subset
original_training_set.reset_index(drop=True, inplace=True)
original_validation_set.reset_index(drop=True, inplace=True)
original_test_set.reset_index(drop=True, inplace=True)

In [377]:
df['annual_inc']=df['annual_inc']/12 #convert annual income to monthly income
df['installment_over_income'] = df['installment'] / df['annual_inc']
df['installment_over_income'] = df['installment_over_income'].replace([np.inf, -np.inf], np.nan)
df.drop('installment', axis=1, inplace=True) #drop this column as it is redundant
df.drop('int_rate', axis=1, inplace=True) #redundant
df.drop('loan_amnt', axis=1, inplace=True)
df.drop('annual_inc', axis=1, inplace=True) #drop this column as it is redundant


# -----------------------------------------------------

In [378]:
# Convert 'issue_d' column to datetime format
df['issue_d'] = pd.to_datetime(df['issue_d'], format='%b-%Y')

# Separate the data into three groups based on the year ranges
training_set = df[df['issue_d'].dt.year < 2015]
validation_set = df[df['issue_d'].dt.year == 2015]
test_set = df[df['issue_d'].dt.year == 2016]

# Optionally, you may want to reset the index for each subset
training_set.reset_index(drop=True, inplace=True)
validation_set.reset_index(drop=True, inplace=True)
test_set.reset_index(drop=True, inplace=True)

# Print the number of rows in each group
print("Number of rows in training set:", len(training_set))
print("Number of rows in validation set:", len(validation_set))
print("Number of rows in test set:", len(test_set))

Number of rows in training set: 273678
Number of rows in validation set: 94264
Number of rows in test set: 28088


In [379]:
categorical_columns = training_set.select_dtypes(include='object').columns.tolist()
print(categorical_columns)
numerical_columns = training_set.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(numerical_columns)
print(len(numerical_columns)+len(categorical_columns))

[]
['term', 'emp_length', 'loan_status', 'dti', 'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'mort_acc', 'verification_status_Not Verified', 'verification_status_Verified', 'home_ownership_MORTGAGE', 'home_ownership_OWN', 'home_ownership_RENT', 'installment_over_income']
15


imputer and scaler


In [380]:
recover_original_training_set=training_set.loc[:,numerical_columns].copy()
num_training_set = training_set.loc[:,numerical_columns].drop('loan_status', axis=1)
numerical_columns_without_loan_status= num_training_set.columns
imputer = IterativeImputer(random_state=0)
imputer.fit(num_training_set)

imputed_array = imputer.transform(num_training_set[numerical_columns_without_loan_status])
num_training_set = pd.DataFrame(imputed_array, columns=numerical_columns_without_loan_status)

# Initialize scalers
robust_scaler = RobustScaler()

# Fit and transform the numerical features
num_training_set = robust_scaler.fit_transform(num_training_set)

# Concatenate imputed features with target variable
training_set.loc[:,numerical_columns_without_loan_status] = num_training_set

# Check for missing values in the training set
print("Missing values in the training set:")
print(training_set.isnull().sum()[training_set.isnull().sum() > 0])

Missing values in the training set:
Series([], dtype: int64)


preprocessing is over, now we have a data set with only numerical features wiht some categorical features one hot encoded. 


# Initialize the neural network

In [381]:

class PD_NN(nn.Module):
    def __init__(self, input_size, hidden_size1=64):
        super(PD_NN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, 1)


    def forward(self, x):
        x=F.relu(self.fc1(x)) #works much better than relu (here i can also put relu but second has to be tanh)
        x=F.tanh(self.fc2(x))
        x=F.sigmoid(x)  # Add sigmoid activation function for output as probability
        return x

In [382]:
# Drop issue date column from the training set because it is of type object and we cant process them 
training_set = training_set.drop('issue_d', axis=1) 

In [383]:
model = PD_NN(input_size=len(numerical_columns_without_loan_status),
              hidden_size1=128 
              )

explainer = shap.DeepExplainer(model, torch.tensor(training_set.drop('loan_status',axis=1).values).float())

optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.1)

batch_size = 128 #maybe 32 instead
num_epochs = 10

# Apply SMOTE only to the training data
smote = SMOTE(sampling_strategy = 1 , random_state=42)
X_smoted, y_smoted = smote.fit_resample(training_set.drop('loan_status', axis =1), training_set['loan_status'])

# WITHOUT SMOTE
#X_smoted,y_smoted = training_set.drop('loan_status', axis =1), training_set['loan_status']

# Define the train_dataset 
train_dataset = TensorDataset(torch.Tensor(X_smoted.values), torch.LongTensor(y_smoted.values))

# Create the train_loader and test_loader
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)


In [384]:
# shap_values = explainer.shap_values(torch.tensor(training_set.drop('loan_status',axis=1).values).float())
# ran for 33 min without any result

In [385]:
torch.random.manual_seed(0)

<torch._C.Generator at 0x1559b2cb0>

# Training


In [386]:

# Define class weights
weights = torch.tensor([1]) # we dont use this and instead only SMOTE

# Assuming you're using BCELoss
criterion = nn.BCELoss(weight = weights)  # Using BCEWithLogitsLoss for numerical stability


criterion3 = nn.MSELoss()

In [387]:
for epoch in range(num_epochs):

    model.train()
    
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()

        X,y=batch
        X.float()
        y = y.float()
        y_pred = model(X).squeeze()
        y_pred_binary = (y_pred > 0.5).float()
        #----------------------------------------------- comment in if you want to use the original loss for the NN
        # original loss for the NN
        loss = criterion(y_pred, y )
        #----------------------------------------------- comment in if you want to use the MSE loss for the profit
        #----------------------------------------------- if you do so, need to stop the scaling and smote
        # MSE loss of the profit vs maximum possible profit
        # profit=net_profit_for_custom_loss(y_pred_binary, batch)
        # max_profit=net_profit_for_custom_loss(y, batch)
        # loss = criterion3(profit, max_profit)
        # loss.requires_grad=True


        
        loss.backward()
        optimizer.step()

    if epoch % 1 == 0:
        print(f'Epoch {epoch+1}, Loss {loss.item()}')


Epoch 1, Loss 0.6834253668785095
Epoch 2, Loss 0.6844149231910706
Epoch 3, Loss 0.6777689456939697
Epoch 4, Loss 0.6856172680854797
Epoch 5, Loss 0.6841753721237183
Epoch 6, Loss 0.6774819493293762
Epoch 7, Loss 0.6808236837387085
Epoch 8, Loss 0.6832053661346436
Epoch 9, Loss 0.6830116510391235
Epoch 10, Loss 0.6819921731948853


# Training Set

In [388]:
X_train,y_train = training_set.drop('loan_status', axis =1), training_set['loan_status']

X_train=torch.tensor(X_train.values, dtype=torch.float32)

# Predict probabilities for the test set
with torch.no_grad():
    y_prob_tensor = model(X_train)
    y_prob_numpy = y_prob_tensor.numpy()

# Convert probabilities to predicted class labels (0 or 1) based on a threshold (e.g., 0.5)
y_pred_train = (y_prob_numpy > 0.5).astype(int).flatten()


# Assuming 'y' and 'y_pred' are the ground truth labels and predicted labels, respectively
# Calculate confusion matrix
cm = confusion_matrix(y_train, y_pred_train)

TN, FP, FN, TP = cm.ravel()
print("Training set:")
true_negative_rate = TN / (TN + FP)
true_positive_rate = TP / (TP + FN)
print("True Negative Rate (non default as non default):", true_negative_rate)
print("True Positive Rate (default as default):", true_positive_rate)

# Print accuracy and classification report
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_train, y_pred_train))

Training set:
True Negative Rate (non default as non default): 0.6393312601381931
True Positive Rate (default as default): 0.6046286031042128
Accuracy: 0.6329262856349432
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.64      0.74    223166
           1       0.28      0.60      0.38     50512

    accuracy                           0.63    273678
   macro avg       0.58      0.62      0.56    273678
weighted avg       0.77      0.63      0.67    273678



In [389]:
net_val_profit_model=net_profit(y_pred_train,original_training_set)
print("Net profit:", net_val_profit_model)

# Create vector of 0s (i.e. everyone receievd the loan in the training set) of size equal to the number of rows in the training set 
zeros=np.zeros(len(original_training_set))
net_val_profit=net_profit(zeros,original_training_set)
print("Net profit original:", net_val_profit)

improved_profit=net_val_profit_model-net_val_profit
print("Improved profit:", improved_profit)

Net profit: 83506689.22999999
Net profit original: 33980007.84500003
Improved profit: 49526681.38499996


# Validation set

In [390]:
recover_original_validation_set = validation_set.copy()
num_validation_set = validation_set.loc[:,numerical_columns].drop('loan_status', axis=1)
numerical_columns_without_loan_status= num_validation_set.columns

imputed_array = imputer.transform(num_validation_set[numerical_columns_without_loan_status])
num_validation_set = pd.DataFrame(imputed_array, columns=numerical_columns_without_loan_status)

# Initialize scalers
robust_scaler = RobustScaler()

# Fit and transform the numerical features
num_validation_set_scaled = robust_scaler.fit_transform(num_validation_set)

# Concatenate imputed features with target variable
validation_set.loc[:,numerical_columns_without_loan_status] = num_validation_set_scaled

# Check for missing values in the training set
print("Missing values in the validation set:")
print(validation_set.isnull().sum()[validation_set.isnull().sum() > 0])

Missing values in the validation set:
Series([], dtype: int64)


In [391]:
# Drop issue date column from the validation set because it is of type object and we cant process them 
validation_set = validation_set.drop('issue_d', axis=1) 

In [392]:
X_validation,y_validation = validation_set.drop('loan_status', axis=1), validation_set['loan_status']

X_validation=torch.tensor(X_validation.values, dtype=torch.float32)

# Predict probabilities for the test set
with torch.no_grad():
    y_prob_tensor = model(X_validation)
    y_prob_numpy = y_prob_tensor.numpy()

# Convert probabilities to predicted class labels (0 or 1) based on a threshold (e.g., 0.5)
y_pred_validation = (y_prob_numpy > 0.5).astype(int).flatten()


# Assuming 'y' and 'y_pred' are the ground truth labels and predicted labels, respectively
# Calculate confusion matrix
cm = confusion_matrix(y_validation, y_pred_validation)

TN, FP, FN, TP = cm.ravel()
print("Validation set:")
true_negative_rate = TN / (TN + FP)
true_positive_rate = TP / (TP + FN)
print("True Negative Rate (non default as non default):", true_negative_rate)
print("True Positive Rate (default as default):", true_positive_rate)

# Print accuracy and classification report
accuracy = accuracy_score(y_validation, y_pred_validation)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_validation, y_pred_validation))

Validation set:
True Negative Rate (non default as non default): 0.7179867497280727
True Positive Rate (default as default): 0.5351680654368849
Accuracy: 0.6724624458966307
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.72      0.77     70791
           1       0.39      0.54      0.45     23473

    accuracy                           0.67     94264
   macro avg       0.60      0.63      0.61     94264
weighted avg       0.71      0.67      0.69     94264



In [393]:
# net profit of the model
net_val_profit_model=net_profit(y_pred_validation,original_validation_set)
print("Net profit:", net_val_profit_model)

# Create vector of 0s (i.e. everyone receievd the loan in the training set) of size equal to the number of rows in the training set 
zeros=np.zeros(len(original_validation_set))
net_val_profit=net_profit(zeros,original_validation_set)
print("Net profit original:", net_val_profit)

improved_profit=net_val_profit_model-net_val_profit
print("Improved profit:", improved_profit)

Net profit: 5912333.892499983
Net profit original: -48272015.629999995
Improved profit: 54184349.52249998


# TEST SET

In [394]:
recover_original_test_set = test_set.copy()
num_test_set = test_set.loc[:,numerical_columns].drop('loan_status', axis=1)
numerical_columns_without_loan_status= num_test_set.columns

imputed_array = imputer.transform(num_test_set[numerical_columns_without_loan_status])
num_test_set = pd.DataFrame(imputed_array, columns=numerical_columns_without_loan_status)

# Initialize scalers
robust_scaler = RobustScaler()

# Fit and transform the numerical features
num_test_set_scaled = robust_scaler.fit_transform(num_test_set)

# Concatenate imputed features with target variable
test_set.loc[:,numerical_columns_without_loan_status] = num_test_set_scaled

# Check for missing values in the training set
print("Missing values in the test set:")
print(test_set.isnull().sum()[test_set.isnull().sum() > 0])

Missing values in the test set:
Series([], dtype: int64)


In [395]:
# Drop issue date column from the test set because it is of type object and we cant process them 
test_set = test_set.drop('issue_d', axis=1) 

In [396]:
X_test,y_test = test_set.drop('loan_status', axis=1), test_set['loan_status']

X_test=torch.tensor(X_test.values, dtype=torch.float32)

# Predict probabilities for the test set
with torch.no_grad():
    y_prob_tensor = model(X_test)
    y_prob_numpy = y_prob_tensor.numpy()

# Convert probabilities to predicted class labels (0 or 1) based on a threshold (e.g., 0.5)
y_pred_test = (y_prob_numpy > 0.5).astype(int).flatten()


# Assuming 'y' and 'y_pred' are the ground truth labels and predicted labels, respectively
# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred_test)

TN, FP, FN, TP = cm.ravel()

print("Test set:")
true_negative_rate = TN / (TN + FP)
true_positive_rate = TP / (TP + FN)
print("True Negative Rate (non default as non default):", true_negative_rate)
print("True Positive Rate (default as default):", true_positive_rate)

# Print accuracy and classification report
accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred_test))

Test set:
True Negative Rate (non default as non default): 0.6253688524590164
True Positive Rate (default as default): 0.6119848156182213
Accuracy: 0.6236115066932498
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.63      0.74     24400
           1       0.20      0.61      0.30      3688

    accuracy                           0.62     28088
   macro avg       0.56      0.62      0.52     28088
weighted avg       0.82      0.62      0.68     28088



improved profits in the test set

In [397]:
# net profit of the model
net_val_profit_model=net_profit(y_pred_test,original_test_set)
print("Net profit:", net_val_profit_model)

# Create vector of 0s (i.e. everyone receievd the loan in the training set) of size equal to the number of rows in the training set 
zeros=np.zeros(len(original_test_set))
net_val_profit=net_profit(zeros,original_test_set)
print("Net profit original:", net_val_profit)

improved_profit=net_val_profit_model-net_val_profit
print("Improved profit:", improved_profit)

Net profit: 11815355.427500002
Net profit original: 20667631.3575
Improved profit: -8852275.93
