In [3]:
# Import required packages

# Data packages
import pandas as pd
import numpy as np

#

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier


# Visualization Packages
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

### Model

In [4]:
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
import torch


class Block(pl.LightningModule):
    def __init__(self, input_size, hidden_units, dropout=0.2, activation=F.relu):
        super(Block, self).__init__()
        self.layer = nn.Linear(input_size, hidden_units)
        self.drop = nn.Dropout(dropout)
        # self.batchNorm = nn.BatchNorm1d(hidden_units)
        self.activation = activation

    def forward(self, x):
        x = self.layer(x)
        # x = self.batchNorm(x)
        x = self.drop(x)
        x = self.activation(x)
        return x


class LDPModel(pl.LightningModule):
    def __init__(self, num_features, num_classes, hidden_units):
        super(LDPModel, self).__init__()
        self.example_input_array = torch.Tensor(32, num_features)
        all_layers = []
        for hidden_unit in hidden_units:
            all_layers.append(Block(input_size=num_features, hidden_units=hidden_unit))
            num_features = hidden_unit
        all_layers.append(nn.Linear(hidden_units[-1], num_classes))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x):
        x = self.layers(x)
        return F.sigmoid(x)


# Lightning Module

In [6]:
import torch
from torchmetrics import Accuracy, MeanMetric, AUROC
import pytorch_lightning as pl

import torch.nn.functional as F
from torch_loan_default.ldp_model import LDPModel


class LDPLitModule(pl.LightningModule):
    def __init__(
        self,
        config,
        num_features,
        pytorch_model=None,
        num_classes=1,
    ):
        super().__init__()
        self.example_input_array = torch.Tensor(32, num_features)
        self.hidden_units = config["hidden_units"]
        self.learning_rate = config["learning_rate"]
        if pytorch_model is not None:
            self.model = pytorch_model
        else:
            self.model = LDPModel(
                num_features=num_features,
                num_classes=num_classes,
                hidden_units=self.hidden_units,
            )

        self.auroc = AUROC(task="binary")

        self.val_loss = []
        self.val_auroc = []
        self.save_hyperparameters()

    def forward(self, x):
        x = self.model(x)
        return x

    def training_step(self, batch, batch_idx):
        loss, auroc = self._shared_eval(batch, batch_idx)
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_auroc", auroc, prog_bar=True)
        return loss

    def on_validation_epoch_start(self):
        self.val_loss.clear()
        self.val_auroc.clear()

    def validation_step(self, batch, batch_idx):
        # this is the validation loop
        loss, auroc = self._shared_eval(batch, batch_idx)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_auroc", auroc, prog_bar=True)
        self.val_auroc.append(auroc)
        self.val_loss.append(loss)

    def on_validation_epoch_end(self):
        avg_loss = torch.stack(self.val_loss).mean()
        avg_auroc = torch.stack(self.val_auroc).mean()
        self.log("ptl/val_loss", avg_loss, sync_dist=True)
        self.log("ptl/val_auroc", avg_auroc, sync_dist=True, prog_bar=True)

    def test_step(self, batch, batch_idx):
        # this is the test loop
        loss, auroc = self._shared_eval(batch, batch_idx)
        self.log("test_loss", loss)
        self.log("test_auroc", auroc)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.learning_rate, weight_decay=0.001
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer, patience=5, factor=0.1, mode="min"
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "train_loss",
            },
        }

    def _shared_eval(self, batch, batch_idx):
        x, y, _ = batch
        predictions = self(x)
        loss = F.binary_cross_entropy(predictions, y)
        auroc = self.auroc(predictions, y)
        return loss, auroc


### Lightning Data Module

In [7]:
import requests
import os
import pytorch_lightning as pl
from torch.utils.data import random_split, DataLoader
import torch
from torch.utils.data import TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


class LDPDataModule(pl.LightningDataModule):
    # REMOVE_FEATURES = ["Default", "LoanID"]
    REMOVE_FEATURES = [
        "CreditScore",
        "CreditUtilizationRate",
        "Default",
        "LoanID",
        "HasCoSigner",
        "LoanPurpose",
        "HasDependents",
        "HasMortgage",
        "MaritalStatus",
        "EmploymentType",
        "Education",
        "LoanTerm",
        "NumCreditLines",
    ]
    TEXT_COLUMNS = ["LoanPurpose", "MaritalStatus"]
    YES_NO_COLUMNS = ["HasMortgage", "HasDependents", "HasCoSigner"]

    def __init__(self, data_dir: str = "./", batch_size: int = 32):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.setup_complete = False

    def prepare_data(self) -> None:
        response = requests.get(
            "https://raw.githubusercontent.com/mmtondreau/LoanDefaultPredictor/main/train.csv"
        )
        with open(os.path.join(self.data_dir, "train.csv"), "wb") as file:
            file.write(response.content)

    def setup(self, stage: str) -> None:
        if self.setup_complete == False:
            self.setup_complete = True 
            data_df = pd.read_csv(os.path.join(self.data_dir, "train.csv"))
            self.df = data_df
            self.df["ID"] = self.df.index
            self.feature_engineer(data_df)
            y_data = data_df["Default"].to_numpy()
            z_data = data_df["ID"].to_numpy()
            x_data_transformed = self.transform_data(data_df)
            self.width = x_data_transformed.shape[1]

            dataset_size = len(x_data_transformed)
            train_size = int(0.8 * dataset_size)
            val_size = int(0.1 * dataset_size)
            test_size = dataset_size - train_size - val_size

            dataset = TensorDataset(
                torch.tensor(x_data_transformed.to_numpy(), dtype=torch.float32),
                torch.tensor(y_data, dtype=torch.float32).view(-1, 1),
                torch.tensor(z_data, dtype=torch.int32),
            )

            self.train_dataset, self.val_dataset, self.test_dataset = random_split(
                dataset, (train_size, val_size, test_size)
            )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=4)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=4)

    def feature_engineer(self, df):
        df["MonthlyIncome"] = round(df["Income"] / 12.0, 2)
        df["InterestRate"] = df["InterestRate"] / 100.0
        df["MonthlyPayment"] = round(
            (
                (df["LoanAmount"] * df["InterestRate"] / 12.0)
                * ((1 + df["InterestRate"] / 12.0) ** df["LoanTerm"])
            )
            / (((1 + df["InterestRate"] / 12.0) ** df["LoanTerm"]) - 1),
            0,
        )
        df["NewDTI"] = round(
            df["DTIRatio"] + (df["MonthlyPayment"] / df["MonthlyIncome"]), 2
        )
        df["LoanToIncome"] = round(df["LoanAmount"] / df["Income"], 2)
        df["MonthlyPaymentToIncome"] = round(df["MonthlyPayment"] / df["Income"], 4)

    def one_hot(self, df, columns):
        if len(columns) == 0:
            return df
        for col in columns:
            print(col)
            categories = df[col].unique()
            category_to_index = {
                category: index for index, category in enumerate(categories)
            }
            df.loc[:, col] = df[col].map(category_to_index)
            num_categories = len(categories)
            one_hot_encoding = torch.nn.functional.one_hot(
                torch.tensor(df[col]), num_classes=num_categories
            )
            one_hot_df = pd.DataFrame(one_hot_encoding.numpy(), columns=categories)
            df = pd.concat([df, one_hot_df], axis=1)

        return df

    def transform_education(self, df):
        if "Education" in self.REMOVE_FEATURES:
            return
        education_mapping = {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3}

        df["Education"] = df["Education"].replace(education_mapping)

    def transform_empoloyment(self, df):
        if "EmploymentType" in self.REMOVE_FEATURES:
            return
        mapping = {
            "Unemployed": 0,
            "Part-time": 1,
            "Self-employed": 2,
            "Full-time": 3,
        }
        df["EmploymentType"] = df["EmploymentType"].replace(mapping)

    def transform_data(self, df):
        train_df_tmp = df.loc[:, ~df.columns.isin(self.REMOVE_FEATURES)]
        train_df_tmp = self.one_hot(
            train_df_tmp, set(self.TEXT_COLUMNS) - set(self.REMOVE_FEATURES)
        )
        self.transform_education(train_df_tmp)
        self.transform_empoloyment(train_df_tmp)

        for yes_no_column in set(self.YES_NO_COLUMNS) - set(self.REMOVE_FEATURES):
            train_df_tmp[yes_no_column] = df[yes_no_column].replace({"Yes": 1, "No": 0})

        normalized_data = self.normalize(train_df_tmp)
        return normalized_data

    def normalize(self, x):
        x_mean = np.mean(x, axis=0)
        x_std = np.std(x, axis=0)

        # Normalize each feature independently
        return (x - x_mean) / x_std


### Training

In [8]:
from torch_loan_default.ldp_data_module import LDPDataModule
from torch_loan_default.ldp_lit_module import LDPLitModule
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
from torch_loan_default.simple_data_module import EvenOddDataModule
import torch
import pandas as pd
import os


config = {
    "hidden_units": [48, 24, 12],
    "learning_rate": 0.001,
    "batch_size": 1280,
}

dm = LDPDataModule(batch_size=config["batch_size"])
dm.prepare_data()
dm.setup(stage="fit")

x, y, z = next(iter(dm.train_dataloader()))
print(x[0])
print(y[0])

df = dm.df

print(df[df["ID"] == z[0].item()].to_string())

model = LDPLitModule(config, num_features=dm.width)
trainer = pl.Trainer(
    devices="auto",
    accelerator="auto",
    max_epochs=100,
    callbacks=[
        EarlyStopping(
            monitor="ptl/val_loss", mode="min", patience=5, min_delta=0.0001
        ),
        ModelCheckpoint(
            monitor="ptl/val_auroc", mode="max", filename="{epoch}-{val_auroc:.2f}"
        ),
    ],
)
trainer.fit(model, datamodule=dm)



GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


tensor([ 1.0341, -0.5340,  0.0670,  1.2833,  0.3672, -1.2135, -1.6108, -0.5340,
         0.1746, -0.1196, -0.0172,  0.0700])
tensor([0.])
          LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  NumCreditLines  InterestRate  LoanTerm  DTIRatio   Education EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose HasCoSigner  Default    ID  MonthlyIncome  MonthlyPayment  NewDTI  LoanToIncome  MonthlyPaymentToIncome
8940  XAP1XVO34R   59   61694      132326          677             104               4        0.1593        24      0.22  Bachelor's     Unemployed        Single         Yes            No       Other         Yes        0  8940        5141.17          6475.0    1.48          2.14                   0.105



  | Name  | Type        | Params | In sizes | Out sizes
-------------------------------------------------------------
0 | model | LDPModel    | 2.1 K  | [32, 12] | [32, 1]  
1 | auroc | BinaryAUROC | 0      | ?        | ?        
-------------------------------------------------------------
2.1 K     Trainable params
0         Non-trainable params
2.1 K     Total params
0.008     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Test

In [9]:

trainer.test(model, datamodule=dm)

model.eval()

x, y, _ = next(iter(dm.test_dataloader()))
y_hat = model(x)
print(x)
print(torch.flatten(y))
print(torch.flatten(y_hat))

Testing: 0it [00:00, ?it/s]

tensor([[ 0.7673, -0.5231,  0.9716,  ...,  0.0758,  0.4510, -0.1525],
        [-0.6336,  1.6030, -1.6077,  ..., -0.5239, -0.9583, -0.7196],
        [-0.4335, -1.5474, -0.7437,  ...,  0.0623,  0.5474, -0.0208],
        ...,
        [ 0.3003, -1.0797,  0.2655,  ...,  0.5138,  0.6622,  0.5790],
        [ 0.2336, -1.3298, -1.3094,  ..., -0.5576, -0.4809, -0.4602],
        [ 0.0335,  0.0680, -1.4576,  ..., -0.5239, -0.8665, -0.5797]])
tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor([0.0847, 0.1316, 0.2257,  ..., 0.0596, 0.0694, 0.0701],
       grad_fn=<ReshapeAliasBackward0>)
