In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer

from torch.utils.data import TensorDataset, DataLoader

import torch
import torch.nn as nn
import torch.optim as optim

import joblib

In [2]:
# Read the raw credit-risk records from disk into a DataFrame
csv_path = '../data/credit_risk_dataset.csv'
data = pd.read_csv(csv_path)

In [3]:
# Per-column overview of the raw data: cardinality, missing share and dtype

# Number of distinct values in each column
unique_data = data.nunique().rename("Unique data")

# Share of missing entries in each column, expressed as a percentage
missing_data = (data.isna().mean() * 100).rename("Missing data")

# Storage dtype of each column
data_types = data.dtypes.rename("Data types")

# Line the three series up side by side (one row per column of `data`)
result = pd.concat([unique_data, missing_data, data_types], axis=1)

# Display the overview table
result

Unnamed: 0,Unique data,Missing data,Data types
person_age,58,0.0,int64
person_income,4295,0.0,int64
person_home_ownership,4,0.0,object
person_emp_length,36,2.747,float64
loan_intent,6,0.0,object
loan_grade,7,0.0,object
loan_amnt,753,0.0,int64
loan_int_rate,348,9.563856,float64
loan_status,2,0.0,int64
loan_percent_income,77,0.0,float64


In [4]:
# Separate the target column (`loan_status`) from the predictor columns
y = data['loan_status']
X = data.drop(columns=['loan_status'])

In [5]:
# Hold out 20% of all rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Carve 25% of the remaining rows (i.e. 20% of the full data) out for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# Report the shape of every split: features first, then targets
print(*(part.shape for part in (X_train, X_val, X_test)))
print(*(part.shape for part in (y_train, y_val, y_test)))

(19548, 11) (6516, 11) (6517, 11)
(19548,) (6516,) (6517,)


In [6]:
# Pipeline to preprocess the numerical features

# Select the numerical columns
numerical_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

# Create the numerical pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),              # Impute the missing values with the median
    # clip=True keeps transformed values inside [0, 1]. The scaler is fitted on the
    # training split only, so without clipping any later value below the training
    # minimum would scale to a negative number and the sqrt step would turn it into NaN.
    # Side effect: values above the training maximum are capped at 1 as well.
    ('scaler', MinMaxScaler(feature_range=(0, 1), clip=True)),  # Scale the data to be between 0 and 1
    ('sqrt', FunctionTransformer(np.sqrt))                      # Apply the square root to the scaled data
])

# Pipeline to preprocess the categorical or nominal features
categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

# Create the categorical pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),   # Impute the missing values with the most frequent
    ('onehot', OneHotEncoder(handle_unknown='ignore'))      # One hot encode; unseen categories become all-zero rows
])

# Combine the numerical and categorical pipelines
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_cols),      # Apply the numerical pipeline to the numerical columns
    ('categorical', categorical_pipeline, categorical_cols) # Apply the categorical pipeline to the categorical columns
])

In [7]:
# Fit the preprocessor on the training split only (so no statistics leak from the
# validation/test splits) and transform the training data in the same pass
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Apply the already-fitted transformation to the held-out splits
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

# Show the shape of the preprocessed data
print(X_train_preprocessed.shape, X_val_preprocessed.shape, X_test_preprocessed.shape)

(19548, 26) (6516, 26) (6517, 26)


In [8]:
# Turn the preprocessed feature matrices into float32 tensors
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_preprocessed, X_val_preprocessed, X_test_preprocessed)
)

# Turn the targets into flat float32 tensors (one value per sample)
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(target.values, dtype=torch.float32).reshape(-1)
    for target in (y_train, y_val, y_test)
)

In [9]:
# Number of samples per mini-batch
batch_size = 32

# Pair every feature tensor with its target tensor
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)
test_data = TensorDataset(X_test_tensor, y_test_tensor)

# Wrap the datasets in loaders; only the training loader reshuffles each epoch
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

# Report features | targets shapes for every split
print(f"{X_train_tensor.shape} | {y_train_tensor.shape}")
print(f"{X_val_tensor.shape} | {y_val_tensor.shape}")
print(f"{X_test_tensor.shape} | {y_test_tensor.shape}")

torch.Size([19548, 26]) | torch.Size([19548])
torch.Size([6516, 26]) | torch.Size([6516])
torch.Size([6517, 26]) | torch.Size([6517])


In [10]:
# Define the neural network
class LoanApprovalNN(nn.Module):
    """Fully connected binary classifier: four ReLU hidden layers and a sigmoid output.

    Args:
        input_dim: Number of input features. When omitted (``None``) the width is
            read from the notebook-level ``X_train_tensor``, which preserves the
            original ``LoanApprovalNN()`` call. Pass it explicitly to rebuild the
            network (e.g. before loading a saved ``state_dict``) without the
            training data in scope.
    """

    def __init__(self, input_dim=None):
        super().__init__()                                  # Call the constructor of the parent class
        if input_dim is None:
            # Backward-compatible default: infer the width from the training tensor
            input_dim = X_train_tensor.shape[1]
        self.fc1 = nn.Linear(input_dim, 64)                 # Define the first fully connected layer
        self.fc2 = nn.Linear(64, 64)                        # Define the second fully connected layer
        self.fc3 = nn.Linear(64, 64)                        # Define the third fully connected layer
        self.fc4 = nn.Linear(64, 32)                        # Define the fourth fully connected layer
        self.output = nn.Linear(32, 1)                      # Define the output layer

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` values in ``[0, 1]``."""
        x = torch.relu(self.fc1(x))             # Apply the ReLU activation function to the first fully connected layer
        x = torch.relu(self.fc2(x))             # Apply the ReLU activation function to the second fully connected layer
        x = torch.relu(self.fc3(x))             # Apply the ReLU activation function to the third fully connected layer
        x = torch.relu(self.fc4(x))             # Apply the ReLU activation function to the fourth fully connected layer
        x = torch.sigmoid(self.output(x))       # Apply the sigmoid activation function to the output layer
        return x

# Instantiate the model.
# The constructor reads X_train_tensor.shape[1] for the input width, so the cell
# that builds X_train_tensor must have run first.
# NOTE(review): no torch.manual_seed call is visible in this notebook, so the
# initial weights (and therefore the training curve) presumably differ between runs.
model = LoanApprovalNN()


In [11]:
# Loss function and optimiser used during training
# Binary cross-entropy on probabilities (the model already applies a sigmoid)
criterion = nn.BCELoss()
# Adam over all model parameters with a learning rate of 0.005
optimizer = optim.Adam(params=model.parameters(), lr=0.005)


In [12]:
# Function to train the model
def train(model, train_loader, valid_loader, criterion, optimizer, epochs=50):
    """Train ``model`` and print the mean training and validation loss per epoch.

    Args:
        model: Network to optimise; expected to return one value per sample
            with a trailing axis of size 1, i.e. shape ``(batch, 1)``.
        train_loader: Iterable of ``(inputs, labels)`` batches used for the updates.
        valid_loader: Iterable of ``(inputs, labels)`` batches used for validation only.
        criterion: Loss callable taking ``(predictions, labels)``.
        optimizer: Optimiser holding the parameters of ``model``.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The losses are averaged over batches and printed.
    """
    for epoch in range(epochs):
        model.train()                                                       # Set the model to training mode
        running_loss = 0.0                                                  # Initialize the running loss

        for inputs, labels in train_loader:
            optimizer.zero_grad()                                           # Clear the gradients
            outputs = model(inputs)                                         # Forward pass
            # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
            # would also drop the batch axis when the last batch holds a single
            # sample, producing a 0-d tensor that no longer matches `labels`.
            loss = criterion(outputs.squeeze(-1), labels)                   # Compute the loss
            loss.backward()                                                 # Backward pass
            optimizer.step()                                                # Update the weights

            running_loss += loss.item()                                     # Accumulate the loss

        # Validation loss
        model.eval()                                                        # Set the model to evaluation mode
        valid_loss = 0.0                                                    # Initialize the validation loss
        with torch.no_grad():                                               # No gradients in validation
            for inputs, labels in valid_loader:                             # Iterate over the validation loader
                outputs = model(inputs)                                     # Forward pass
                valid_loss += criterion(outputs.squeeze(-1), labels).item() # Accumulate the loss

        print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {valid_loss/len(valid_loader):.4f}')

# Run 50 training epochs, reporting the validation loss after each one
train(
    model=model,
    train_loader=train_loader,
    valid_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=50,
)


Epoch 1/50, Loss: 0.3517, Validation Loss: 0.3157
Epoch 2/50, Loss: 0.3032, Validation Loss: 0.2849
Epoch 3/50, Loss: 0.2848, Validation Loss: 0.2715
Epoch 4/50, Loss: 0.2717, Validation Loss: 0.2697
Epoch 5/50, Loss: 0.2671, Validation Loss: 0.2627
Epoch 6/50, Loss: 0.2571, Validation Loss: 0.2561
Epoch 7/50, Loss: 0.2512, Validation Loss: 0.2484
Epoch 8/50, Loss: 0.2460, Validation Loss: 0.2572
Epoch 9/50, Loss: 0.2467, Validation Loss: 0.2458
Epoch 10/50, Loss: 0.2455, Validation Loss: 0.2700
Epoch 11/50, Loss: 0.2414, Validation Loss: 0.2416
Epoch 12/50, Loss: 0.2385, Validation Loss: 0.2628
Epoch 13/50, Loss: 0.2400, Validation Loss: 0.2805
Epoch 14/50, Loss: 0.2354, Validation Loss: 0.2472
Epoch 15/50, Loss: 0.2367, Validation Loss: 0.2466
Epoch 16/50, Loss: 0.2351, Validation Loss: 0.2381
Epoch 17/50, Loss: 0.2341, Validation Loss: 0.2379
Epoch 18/50, Loss: 0.2324, Validation Loss: 0.2385
Epoch 19/50, Loss: 0.2313, Validation Loss: 0.2434
Epoch 20/50, Loss: 0.2300, Validation Lo

In [13]:
# Function to evaluate the model
def evaluate(model, test_loader, criterion):
    """Print the mean per-batch loss and the accuracy of ``model`` on ``test_loader``.

    Args:
        model: Network returning one probability per sample with a trailing
            axis of size 1, i.e. shape ``(batch, 1)``.
        test_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(predictions, labels)``.

    Returns:
        None. A probability above 0.5 counts as a prediction of class 1.
    """
    model.eval()                                                # Set the model to evaluation mode
    test_loss = 0.0                                             # Initialize the test loss
    correct = 0                                                 # Initialize the number of correct predictions
    total = 0                                                   # Initialize the total number of predictions

    with torch.no_grad():                                       # No gradients in evaluation
        for inputs, labels in test_loader:                      # Iterate over the test loader
            # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
            # would also drop the batch axis for a single-sample batch, so the
            # predictions would no longer match the shape of `labels`.
            outputs = model(inputs).squeeze(-1)                 # Forward pass
            loss = criterion(outputs, labels)                   # Compute the loss
            test_loss += loss.item()                            # Accumulate the loss
            predicted = (outputs > 0.5).float()                 # Convert the probabilities to binary predictions
            total += labels.size(0)                             # Accumulate the number of predictions
            correct += (predicted == labels).sum().item()       # Accumulate the number of correct predictions

    accuracy = correct / total                                  # Compute the accuracy

    # Print the test loss and accuracy
    print(f'Test Loss: {test_loss/len(test_loader):.4f}, Test Accuracy: {accuracy:.4f}')

# Report loss and accuracy on the held-out test split
evaluate(model=model, test_loader=test_loader, criterion=criterion)

Test Loss: 0.2536, Test Accuracy: 0.9179


In [15]:
# Persist the fitted preprocessing pipeline so inference can reuse the same transformation
joblib.dump(preprocessor, filename='preprocessor.pkl')

# Persist only the learned weights; the LoanApprovalNN class is needed to load them back
trained_weights = model.state_dict()
torch.save(trained_weights, 'model.pth')