In [17]:
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix

from imblearn.over_sampling import SMOTE, ADASYN

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [18]:
def AUC_score(y_ground_truth, y_predicted_probability):
    return roc_auc_score(y_ground_truth, y_predicted_probability)

def to_submission(ids, y_test_predicted_probability):
    y_test=pd.DataFrame(y_test_predicted_probability,columns=['loan_status'], index=ids)
    y_test.index.name = 'id'
    y_test.to_csv('data/submission.csv')
    
def regularization_fn(mdl, lambda_l2=1e-3, lambda_l1=0):
    """Regularization function

    Parameters
    ----------
    mdl: PyTorch model, such as `Sequential`
    lambda_l2: l2-regularization strength for all weights
    labmda_l1: l1-regularization strength for all weights
    """
    # Initializa regularization term
    reg = 0
    for (name, param) in model.named_parameters():
        # Usually, don't regularize bias terms
        if 'weight' in name:
            # l2-regularization
            reg = reg + lambda_l2 * param.square().sum()
            # l1-regularization
            reg = reg + lambda_l1 * param.abs().sum()
    return reg
    
data_train=pd.read_csv("data/2022-02-07_LOANS_TRAIN.csv")
data_test=pd.read_csv("data/2022-02-07_LOANS_TEST.csv")
y_submission_id = data_test["id"].to_numpy()

data_train.drop(columns="id", inplace=True)
data_test.drop(columns="id", inplace=True)

X_train = data_train.drop(columns='loan_status').to_numpy()
y_train = data_train['loan_status'].to_numpy()
X_test = data_test.to_numpy()

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20)

print(sorted(Counter(y_train).items()))
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
print(sorted(Counter(y_train).items()))

[(0, 133549), (1, 24251)]
[(0, 133549), (1, 133549)]


In [20]:
train_ds = TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train).type(torch.LongTensor))
test_ds = TensorDataset(torch.Tensor(X_val), torch.Tensor(y_val).type(torch.LongTensor))

In [58]:
n_features = X_train.shape[1]

l1 = 300
l2 = 100
l3 = 50
lout = 2

dropout=0

lambda_l2=1e-3
lr = 1e-3
momentum=0.1

model = nn.Sequential(
    nn.Flatten(),  
    nn.Linear(n_features, l1),
    nn.ReLU(),
    nn.Dropout(dropout),

    nn.Linear(l1, l2),
    nn.ReLU(),
    nn.Dropout(dropout),

    nn.Linear(l2, l3),
    nn.ReLU(),
    nn.Dropout(dropout),

    nn.Linear(l3, lout),
    nn.Softmax(dim=1)
)

In [59]:
bs = 32
n_epochs = 10

train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=bs*4, shuffle=True)

# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, momentum=momentum)
loss_fn = nn.CrossEntropyLoss()

training_accuracy_history = np.zeros([n_epochs, 1])
training_loss_history = np.zeros([n_epochs, 1])
validation_accuracy_history = np.zeros([n_epochs, 1])
validation_loss_history = np.zeros([n_epochs, 1])

for epoch in range(n_epochs):
    print(f'Epoch {epoch+1}/{n_epochs}:', end='')

    if epoch==5:
        new_state = optimizer.state_dict()
        new_state['param_groups'][0]['lr']=lr/10
        new_state['param_groups'][0]['momentum']=0.75
        optimizer.load_state_dict(new_state)

    train_total = 0
    train_correct = 0
    model.train()

    for batch_idx, (data, label) in enumerate(train_loader):
        # Erase accumulated gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(data)
        # Calculate loss
        loss = loss_fn(output, label) + regularization_fn(model, lambda_l2=lambda_l2)
        # loss = loss_fn(output, label)
        # Backward pass
        loss.backward()
        # Weight update
        optimizer.step()

        # track training accuracy
        _, predicted = torch.max(output.data, 1)
        
        
        y_pred=output.data[:,1].numpy()
        try:
            score = AUC_score(label, y_pred)
        except:
            score = -1
        
        train_total += label.size(0)
        train_correct += (predicted == label).sum().item()
        # track training loss
        training_loss_history[epoch] += loss.item()

    training_loss_history[epoch] /= len(train_loader)
    training_accuracy_history[epoch] = train_correct / train_total
    print(f' loss: {training_loss_history[epoch,0]:0.4f}, acc: {training_accuracy_history[epoch,0]:0.4f}, AUC: {score:0.4f}',end='')

    # validate
    test_total = 0
    test_correct = 0

    with torch.no_grad():
        model.eval()
        for batch_idx, (data, label) in enumerate(test_loader):
            # forward pass
            output = model(data)
            
            y_pred=output.data[:,1].numpy()
            try: 
                score = AUC_score(label, y_pred)
            except:
                score = -1
            
            # find accuracy
            _, predicted = torch.max(output.data, 1)
            test_total += label.size(0)
            test_correct += (predicted == label).sum().item()
            # find loss
            loss = loss_fn(output, label)
            validation_loss_history[epoch] += loss.item()

            validation_loss_history[epoch] /= len(test_loader)
            validation_accuracy_history[epoch] = test_correct / test_total

    print(f', val loss: {validation_loss_history[epoch,0]:0.8f}, val acc: {validation_accuracy_history[epoch,0]:0.4f}, val AUC: {score:0.4f}')

Epoch 1/10: loss: 0.6384, acc: 0.6620, AUC: 0.6786, val loss: 0.00203204, val acc: 0.5885, val AUC: 0.8542
Epoch 2/10: loss: 0.6264, acc: 0.6818, AUC: 0.8303, val loss: 0.00184794, val acc: 0.6681, val AUC: 0.8125
Epoch 3/10: loss: 0.6234, acc: 0.6879, AUC: 0.7394, val loss: 0.00168882, val acc: 0.6820, val AUC: 0.8400
Epoch 4/10: loss: 0.6226, acc: 0.6909, AUC: 0.7337, val loss: 0.00192229, val acc: 0.6180, val AUC: 0.8333
Epoch 5/10: loss: 0.6217, acc: 0.6937, AUC: 0.7811, val loss: 0.00191642, val acc: 0.6459, val AUC: 0.7955
Epoch 6/10: loss: 0.6101, acc: 0.7097, AUC: 0.7212, val loss: 0.00204661, val acc: 0.6680, val AUC: 0.6381
Epoch 7/10: loss: 0.6068, acc: 0.7135, AUC: 0.7937, val loss: 0.00182083, val acc: 0.6495, val AUC: 0.3768
Epoch 8/10: loss: 0.6057, acc: 0.7151, AUC: 0.7758, val loss: 0.00204569, val acc: 0.6469, val AUC: 0.7905
Epoch 9/10: loss: 0.6049, acc: 0.7155, AUC: 0.8788, val loss: 0.00256728, val acc: 0.6515, val AUC: 0.3250
Epoch 10/10: loss: 0.6042, acc: 0.716

In [60]:
model.eval()
output = model(torch.Tensor(X_val))
y_pred=output.data[:,1].numpy()

AUC_score(y_val, y_pred)

0.6561848643296664

In [61]:
_, predicted = torch.max(output.data, 1)
np.sum(y_val==predicted.numpy()) / len(y_val)

0.6786058301647655

In [52]:
(output.data).max(axis=1)[0].mean()

tensor(0.9640)