In [None]:
# import sys
# from google.colab import drive

# drive.mount('/content/gdrive', force_remount=True)
# SRC_PATH = '/content/gdrive/MyDrive/MP FEB/Colab'
# sys.path.append(SRC_PATH)

# !pip install wandb -qqq
# import wandb
# wandb.login()

# wandb.init(
#     # Set the project where this run will be logged
#     project="baseline_poc", 
#     # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
#     #name="experiment 1"
#     # Track hyperparameters and run metadata
#     #config={
#       #"learning_rate": 0.02,
#       #"architecture": "CNN",
#       #"dataset": "CIFAR-100",
#       #"epochs": 10,}
#     )

In [None]:
%load_ext autoreload
%autoreload 2

import dask.dataframe as dd
import os
import numpy as np
import pandas as pd
import torch

import matplotlib.pyplot as plt
%matplotlib inline

# Device used for every `.to(DEVICE)` call below. Prefer the GPU, but fall back
# to the CPU so the notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# 1. Import Dataset

### 1.1 Loading raw data

In [None]:
# Folder containing the parquet shards, resolved relative to the current working
# directory. os.path.join avoids the invalid "\d" escape and works on every OS.
data_folder = os.path.join(os.getcwd(), "data")

# Target dtype for each column; applied with `astype` right after loading.
blumenstock_types = {'LOAN_SEQUENCE_NUMBER': str, 'INT_RATE': float, 'ORIG_UPB': float, 'FICO_SCORE': float,
                    'DTI_R': float, 'LTV_R': float, 'FIRST_PAYMENT_DATE': str, 'BAL_REPAID': float, 'T_ACT_12M': float, 'T_DEL_30D': float, 
                    'T_DEL_60D': float, 'LABEL': float, 'REMAINING_MONTHS_TO_LEGAL_MATURITY': float, "TIME_TO_EVENT": float, 'TOTAL_OBSERVED_LENGTH': float}


# Load every shard matching the glob as one dask DataFrame.
df_blumenstock = dd.read_parquet(os.path.join(data_folder, "blumenstock_labeled_sample_orig_*.parquet.gzip"))
df_blumenstock = df_blumenstock.astype(blumenstock_types)

In [None]:
# Preview 10 rows of the freshly loaded (not yet normalised) data.
df_blumenstock.head(10)

### 1.2 Normalising raw data

In [None]:
# Numeric covariates that are standardised column by column.
covariates_to_normalise = [
    'INT_RATE', 'ORIG_UPB', 'FICO_SCORE', 'DTI_R', 'LTV_R',
    'REMAINING_MONTHS_TO_LEGAL_MATURITY',
    'BAL_REPAID', 'T_ACT_12M', 'T_DEL_30D', 'T_DEL_60D',
]

# Column-wise statistics over the selected covariates; `.compute()` evaluates
# the dask expression and returns the concrete values.
raw_covariates = df_blumenstock[covariates_to_normalise]
df_blumenstoch_mean = raw_covariates.mean().compute()
df_blumenstoch_std = raw_covariates.std().compute()

# Standardise: subtract the mean and divide by the standard deviation.
df_blumenstock[covariates_to_normalise] = (raw_covariates - df_blumenstoch_mean) / df_blumenstoch_std


In [None]:
# Preview 10 rows again to inspect the covariates after normalisation.
df_blumenstock.head(10)

### 1.3 Creating dataloaders

In [None]:
from torch.utils.data import DataLoader
from FREDDIEMAC_baseline_data import FREDDIEMAC_basline_dataset, FREDDIEMAC_baseline_dataloader

BATCH_SIZE = 2**8

# Arguments forwarded positionally to FREDDIEMAC_basline_dataset below.
# NOTE(review): their exact semantics live in FREDDIEMAC_baseline_data — confirm there.
frac_cases=0.25
test_set = False
augment = False
data_augment_factor = 3
random_state = 42

# Columns handed to the dataset as model covariates.
allowed_covariates = ['INT_RATE', 'ORIG_UPB', 'FICO_SCORE', 'DTI_R','LTV_R', 'BAL_REPAID', 
                     'T_ACT_12M', 'T_DEL_30D', 'T_DEL_60D', 'REMAINING_MONTHS_TO_LEGAL_MATURITY']

# Names of the time-to-event and event-label columns.
TIME_TO_EVENT_covariate = 'TIME_TO_EVENT'
LABEL_covariate = 'LABEL'


FREDDIEMAC_raw_dataset = FREDDIEMAC_basline_dataset(df_blumenstock, 
                                                    allowed_covariates,
                                                    TIME_TO_EVENT_covariate,
                                                    LABEL_covariate,
                                                    frac_cases,
                                                    random_state,
                                                    test_set,
                                                    augment,
                                                    data_augment_factor)

print("This dataset will contain %d samples" % len(FREDDIEMAC_raw_dataset))

# Use the loader class that is actually imported at the top of this cell;
# the previous name `FREDDIEMAC_dataloader` was never defined (NameError).
data_loader = FREDDIEMAC_baseline_dataloader(dataset=FREDDIEMAC_raw_dataset,
                                             batch_size=BATCH_SIZE)

print("This dataloader will deliver %d batches" % data_loader.get_max_iterations())

# Pull one batch to sanity-check the tensor shapes.
batch_data, batch_data_length, batch_event, batch_tte = next(data_loader)

print(batch_data.shape)
print(batch_data_length.shape)
print(batch_event.shape)
print(batch_tte.shape)

### Data Exploration

In [None]:
# Fetch one batch through the loader's iterator.
batch_data, batch_data_length, batch_event, batch_tte = next(iter(data_loader))

# Show at most 16 samples, but never more than the batch actually holds
# (a short batch would otherwise raise an IndexError).
num_samples_to_show = min(BATCH_SIZE, 16, len(batch_event))

for i in range(num_samples_to_show):
    print("batch event= %d --- batch_data_length= %d --- batch_tte= %d" % (batch_event[i], batch_data_length[i], batch_tte[i]))

### Test Sample (note: it may currently overlap with the training set)

In [None]:
# NOTE(review): this cell rebinds `frac_cases` and `FREDDIEMAC_raw_dataset`.
# Later cells that read `FREDDIEMAC_raw_dataset` (input size, max length) will
# therefore see this tiny sample rather than the training dataset — confirm
# that this is intended.
frac_cases=0.00004

FREDDIEMAC_raw_dataset = FREDDIEMAC_basline_dataset(df_blumenstock, 
                                                    allowed_covariates,
                                                    TIME_TO_EVENT_covariate,
                                                    LABEL_covariate,
                                                    frac_cases,
                                                    random_state,
                                                    test_set,
                                                    augment,
                                                    data_augment_factor)

# Use the loader class imported in the dataloader cell; `FREDDIEMAC_dataloader`
# was never defined and raised a NameError on a fresh kernel.
test_data_loader = FREDDIEMAC_baseline_dataloader(dataset=FREDDIEMAC_raw_dataset, batch_size=1)
test_batch_data, test_batch_data_length, test_batch_event, test_batch_tte = next(iter(test_data_loader))
print("This dataset will contain %d samples" % len(FREDDIEMAC_raw_dataset))
print("This dataloader will deliver %d batches" % test_data_loader.get_max_iterations())
print("batch event= %d --- batch_data_length= %d --- batch_tte= %d" % (test_batch_event[0], test_batch_data_length[0], test_batch_tte[0]))

# 2. Hyperparameters

In [None]:
from torch.nn import MSELoss
from tqdm import trange, tqdm

from torch.optim import Adam

from deepHit import Encoder, CauseSpecificSubnetwork, DeepHit
from baseline_losses import loss_1_batch, loss_2_batch

# --- Training schedule ---
NUM_EPOCHS = 10
LEARNING_RATE_ENCODER = 1e-3  # Adam learning rate for the encoder
LEARNING_RATE_CAUSESS = 1e-3  # Adam learning rate for the cause-specific subnetwork

# --- Loss configuration ---
LOSS_1_AMPLIFIER = 1  # multiplier applied to loss_1_batch
LOSS_2_AMPLIFIER = 1  # multiplier applied to loss_2_batch
SIGMA = 0.1           # passed to loss_2_batch

# --- Validation (currently disabled) ---
RUN_VALIDATION_ROUND = False
VAL_NUM_CASES_RUNTIME = BATCH_SIZE

# --- Sizes read from the dataset object ---
# NOTE(review): FREDDIEMAC_raw_dataset is whatever an earlier cell bound last;
# verify it is the dataset these sizes should come from.
input_size = FREDDIEMAC_raw_dataset.get_num_covariates()
output_size = FREDDIEMAC_raw_dataset.get_num_covariates()
MAX_LENGTH = FREDDIEMAC_raw_dataset.get_max_length()

# --- Network architecture ---
NUM_CAUSES = 3
hidden_size_encoder = 512
context_size = 256
hidden_cause_size = 512

# 3. Defining The Model

In [None]:
# Build both sub-networks on the target device, then combine them in DeepHit.
encoder = Encoder(
    input_size,
    hidden_size_encoder,
    context_size,
).to(DEVICE)
causess = CauseSpecificSubnetwork(
    context_size,
    hidden_cause_size,
    input_size,
    MAX_LENGTH,
    NUM_CAUSES,
).to(DEVICE)
DHT = DeepHit(encoder, causess, DEVICE)

# One Adam optimizer per sub-network, each with its own learning rate.
optimizer_encoder = Adam(
    encoder.parameters(),
    lr=LEARNING_RATE_ENCODER,
)
optimizer_causess = Adam(
    causess.parameters(),
    lr=LEARNING_RATE_CAUSESS,
)

### Testing a sample before training

In [None]:
from utils import plot_fht_and_cif_baseline
from baseline_losses import CIF_K

# Take one sample from the test loader.
test_batch_data, test_batch_data_length, test_batch_event, test_batch_tte = next(iter(test_data_loader))

# Add a leading dimension and move the tensors to the model's device.
test_batch_data = test_batch_data.unsqueeze(0).to(DEVICE)
test_batch_data_length = test_batch_data_length.unsqueeze(0).to(DEVICE)
test_batch_event = test_batch_event.unsqueeze(0).to(DEVICE)

DHT.eval()

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    test_first_hitting_time = DHT(test_batch_data, test_batch_data_length)
print("sample has length %d" % test_batch_data_length[0])

# The flat argmax is decoded as (event, time) using blocks of MAX_LENGTH entries.
test_first_hitting_time_argmax = test_first_hitting_time.argmax().item()
model_event_prediction = test_first_hitting_time_argmax // MAX_LENGTH
model_tte_prediction = test_first_hitting_time_argmax % MAX_LENGTH
print("the model predicts the event %d at time %d" % (model_event_prediction, model_tte_prediction + 1))

# Position in the CIF_K output that is reported for each cause.
# NOTE(review): presumably a fixed evaluation horizon — confirm the meaning of 23.
cif_report_index = 23
for cause_index, cause_name in enumerate(("prepay", "default", "full repay")):
    cause_cif = CIF_K(test_first_hitting_time[0], cause_index, MAX_LENGTH)
    print("probability of %s event = %.2f" % (cause_name, cause_cif[cif_report_index].item()))

plot_fht_and_cif_baseline(test_first_hitting_time[0], MAX_LENGTH)

In [None]:
#PATH = "/content/gdrive/MyDrive/MP FEB/Colab/models/baseline_model_v4.pth"

train_data_loader = data_loader

# An earlier cell calls DHT.eval() for the pre-training sanity check and never
# switches back. Put the model in training mode explicitly so any mode-dependent
# layers (e.g. dropout / batch norm, if present) behave correctly while training.
DHT.train()

# start training
for epoch in range(NUM_EPOCHS):
  # Running sum of the detached batch losses for this epoch.
  epoch_loss = 0

  for batch_number in range(len(train_data_loader)):
    batch_data, batch_data_length, batch_event, batch_tte = next(train_data_loader)

    # Clear gradients left over from the previous step on both optimizers.
    optimizer_encoder.zero_grad()
    optimizer_causess.zero_grad()

    batch_data = batch_data.to(DEVICE)
    batch_data_length = batch_data_length.to(DEVICE)
    batch_event = batch_event.to(DEVICE)
    batch_tte = batch_tte.to(DEVICE)

    first_hitting_time_batch = DHT(batch_data, batch_data_length)

    # Total loss is the sum of the two weighted loss terms.
    loss1 = LOSS_1_AMPLIFIER*loss_1_batch(first_hitting_time_batch, batch_event, batch_tte, MAX_LENGTH, DEVICE)
    loss2 = LOSS_2_AMPLIFIER*loss_2_batch(first_hitting_time_batch, batch_event, batch_tte, NUM_CAUSES, MAX_LENGTH, SIGMA, DEVICE)

    batch_loss = loss1 + loss2
    batch_loss.backward()

    # detach() keeps the running sum out of the autograd graph.
    epoch_loss += batch_loss.detach()

    #wandb.log({"train_loss1": loss1.item(), "train_loss2": loss2.item()})
    print({"train_loss1": loss1.item(), "train_loss2": loss2.item()})

    optimizer_encoder.step()
    optimizer_causess.step()

    # if batch_number % 2**8 == 0:
    #   torch.save(DHT.state_dict(), PATH)

#   if RUN_VALIDATION_ROUND:
#     # validating round
#     DHT.eval()

#     with torch.no_grad():
#       val_poc_raw_dataset = PocDataset(num_cases=VAL_NUM_CASES_RUNTIME)
#       val_data_loader = torch.utils.data.DataLoader(val_poc_raw_dataset,batch_size=VAL_NUM_CASES_RUNTIME)
#       val_batch_data, val_data_length, val_batch_event, val_batch_tte, _ = next(iter(val_data_loader))
#       val_batch_data = val_batch_data.to(DEVICE)
#       val_data_length = val_data_length.to(DEVICE)
#       val_batch_event = val_batch_event.to(DEVICE)
#       val_batch_tte = val_batch_tte.to(DEVICE)

#       val_first_hitting_time_batch = DHT(val_batch_data, val_data_length)

#       val_loss1 = LOSS_1_AMPLIFIER*loss_1_batch(val_first_hitting_time_batch, val_batch_event, val_batch_tte, MAX_LENGTH, DEVICE)/VAL_NUM_CASES_RUNTIME
#       val_loss2 = LOSS_2_AMPLIFIER*loss_2_batch(val_first_hitting_time_batch, val_batch_event, val_batch_tte, NUM_CAUSES, MAX_LENGTH, SIGMA, DEVICE)/VAL_NUM_CASES_RUNTIME

#       wandb.log({"val_loss1": val_loss1.item(), "val_loss2": val_loss2.item()})
#       wandb.log({"train_epoch_loss" : epoch_loss.item(), "val_epoch_loss" : val_loss1.item() + val_loss2.item(),"epoch": epoch})

#     DHT.train()
#     # end validating round

  #torch.save(DHT.state_dict(), PATH)

#wandb.finish() 

In [None]:
from utils import plot_fht_and_cif_baseline
from baseline_losses import CIF_K

# Take one sample from the test loader.
test_batch_data, test_batch_data_length, test_batch_event, test_batch_tte = next(iter(test_data_loader))
# Add a leading dimension and move the tensors to the model's device.
test_batch_data = test_batch_data.unsqueeze(0).to(DEVICE)
test_batch_data_length = test_batch_data_length.unsqueeze(0).to(DEVICE)
test_batch_event = test_batch_event.unsqueeze(0).to(DEVICE)

DHT.eval()

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    test_first_hitting_time = DHT(test_batch_data, test_batch_data_length)
print("sample has length %d" % test_batch_data_length[0])

# The flat argmax is decoded as (event, time) using blocks of MAX_LENGTH entries.
test_first_hitting_time_argmax = test_first_hitting_time.argmax().item()
model_event_prediction = test_first_hitting_time_argmax // MAX_LENGTH
model_tte_prediction = test_first_hitting_time_argmax % MAX_LENGTH
print("the model predicts the event %d at time %d" % (model_event_prediction, model_tte_prediction + 1))

# Position in the CIF_K output that is reported for each cause.
# NOTE(review): presumably a fixed evaluation horizon — confirm the meaning of 23.
cif_report_index = 23
for cause_index, cause_name in enumerate(("prepay", "default", "full repay")):
    cause_cif = CIF_K(test_first_hitting_time[0], cause_index, MAX_LENGTH)
    print("probability of %s event = %.2f" % (cause_name, cause_cif[cif_report_index].item()))

plot_fht_and_cif_baseline(test_first_hitting_time[0], MAX_LENGTH)