### Importing Essential Packages

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Select GPU 0 only when CUDA is actually present. On a CPU-only machine the
# unconditional call raises and stops the notebook, even though the rest of
# the notebook is able to run on the CPU.
if torch.cuda.is_available():
    torch.cuda.set_device(0) # cuda

### Loading and preparing the dataset 

In [None]:
# Directory that holds the dataset; the CSV is read from inside it.
BASE = './EQ_Dataset/'
df_eq = pd.read_csv(
    BASE + 'df_EQ_dataset.csv',
    encoding='utf8',
)

In [None]:
# Show how many columns the loaded frame has.
print(len(df_eq.columns))
# df_eq.info()

In [None]:
# Display the column labels of the loaded frame.
df_eq.columns

In [None]:
# Display the distinct values found in the 'setupYear' column.
df_eq['setupYear'].unique()

In [None]:
# Convert both year columns to decades: divide by ten, then cast to int
# (the cast drops the fractional part).
for year_col in ('minDOB', 'setupYear'):
    df_eq[year_col] = (df_eq[year_col] / 10).astype(int)

In [None]:
# Assign every modelled column a role: categorical, continuous or target.
#
# A wider categorical selection that is currently not in use:
# cat_names = ['billOrder', 'billType', 'dueDayW', 'dueMonth','dueDayM', 'dueYear', 'billDayM', 'billMonth', 
#              'billRoute', 'numAccountHolders', 'hasMailAddress', 'city', 'postcode', 'incomeGroup', 
#              'wealthGroup', 'segment', 'RA_CODE_2016', 'AverageHhdSize', 'MB_CODE_2016', 'SA1_7DIGITCODE_2016', 
#              'medianPersonPerBedroom', 'minDOB', 'setupYear']

# Categorical columns.
cat_names = [
    'billOrder', 'billType', 'dueMonth', 'billRoute',
    'numAccountHolders', 'hasMailAddress', 'postcode', 'incomeGroup',
    'wealthGroup', 'AverageHhdSize', 'minDOB', 'setupYear',
]

# Continuous (numerical) columns.
cont_names = [
    'billDuration',
    'medianHhdIncWkly',
    'medianMortgageWkly',
    'medianRentWkly',
]

# Prediction target.
target_name = ['billPaid']

# Any column without a role above is collected for exclusion.
exclude_names = list(set(df_eq.columns) - set(cat_names) - set(cont_names) - set(target_name))
# Sanity check: sizes of the four groups.
print(len(cat_names), len(cont_names), len(exclude_names), len(target_name))

In [None]:
# Display the columns that were given no role and will be dropped.
exclude_names

In [None]:
# Keep only the categorical, continuous and target columns.
df_eq = df_eq.drop(columns=exclude_names)

In [None]:
# Take a random subset for quick code testing; remove this cell afterwards.
# The sample size is capped at the number of available rows, because
# `sample` raises when asked for more rows than the frame holds.
n_subset = min(10000, len(df_eq))
df_eq = df_eq.sample(n_subset).reset_index(drop=True)
df_eq

### Perform LabelEncoding for categorical variables

In [None]:
# Replace the values of each categorical column with integer labels,
# fitting a separate encoder per column.
for cat_col in cat_names:
    encoder = LabelEncoder()
    df_eq[cat_col] = encoder.fit_transform(df_eq[cat_col])

In [None]:
# Show the remaining column count, then display the encoded frame.
print(len(df_eq.columns))
df_eq

In [None]:
# df_eq.info()

In [None]:
# making all categorical variables
for col in cat_names:
    df_eq[col] = df_eq[col].astype('category')

### Train Test Split

In [None]:
# Report the distinct target values and the percentage of rows per value.
target_col = df_eq['billPaid']
print('Classes: ', target_col.unique())
print('----Class Distrution----')
print(target_col.value_counts() / len(df_eq) * 100)

In [None]:
# Positional 80/20 split: the leading rows form the training set, the
# trailing rows form the test set.
# NOTE(review): nothing is shuffled here, so the split depends on the row
# order of df_eq at this point - confirm that order is acceptable.
split_at = int(len(df_eq) * 0.8)
df_train, df_test = df_eq.iloc[:split_at], df_eq.iloc[split_at:]
del df_eq
len(df_train), len(df_test)

### Define and Create Pytorch Dataset

In [None]:
class EqDataset(Dataset):
    """Dataset yielding (categorical values, continuous values, target) per row.

    Args:
        X: DataFrame of features. The columns named in ``embedded_col_names``
            are stored as an int64 array, all remaining columns as float32.
        y: Targets, one per row of ``X``. Stored as a NumPy array so that
            ``__getitem__`` always indexes by position; a pandas Series with
            a non-default index would otherwise be looked up by label.
        embedded_col_names: Names of the categorical columns of ``X``.
    """

    def __init__(self, X, y, embedded_col_names):
        # `astype` and `drop` already return new objects, so the caller's
        # frame is never modified and no explicit copies are required.
        self.X1 = X.loc[:, embedded_col_names].values.astype(np.int64) #categorical columns
        self.X2 = X.drop(columns=embedded_col_names).values.astype(np.float32) #numerical columns
        self.y = np.asarray(y)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the categorical row, continuous row and target at ``idx``."""
        return self.X1[idx], self.X2[idx], self.y[idx]

In [None]:
# Separate the 'billPaid' target from the feature columns of each split.
X_train = df_train.drop(columns=['billPaid'])
y_train = df_train['billPaid'].values

X_test = df_test.drop(columns=['billPaid'])
y_test = df_test['billPaid'].values

In [None]:
# Wrap both splits as datasets and batch them; only the training loader
# shuffles its samples.
batch_size = 10

train_ds = EqDataset(X_train, y_train, cat_names)
valid_ds = EqDataset(X_test, y_test, cat_names)

train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False)

### Making device (GPU/CPU) compatible

In order to make use of a GPU if available, we'll have to move our data and model to it.

In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU."""
    device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_type)
    
# Pick the device once and display it.
device = get_default_device()
device

In [None]:
def to_device(data, device):
    """Move ``data`` to ``device``.

    A list or tuple is processed element by element (recursively) and the
    result is returned as a list; anything else is moved through its own
    ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    return [to_device(item, device) for item in data]

In [None]:
class DeviceDataLoader():
    """Iterate over a wrapped dataloader, moving each batch to a device."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __len__(self):
        """Return the number of batches of the wrapped dataloader."""
        return len(self.dl)

    def __iter__(self):
        """Yield the wrapped dataloader's batches after moving them to the device."""
        for batch in self.dl:
            moved = to_device(batch, self.device)
            yield moved

In [None]:
# Re-bind both loaders to wrappers that deliver batches on `device`.
train_dl, valid_dl = (
    DeviceDataLoader(train_dl, device),
    DeviceDataLoader(valid_dl, device),
)

### Prepare Embedding sizes for Categorical Variables

In [None]:
# Number of categories of each categorical column, read from its category dtype.
colname_ncats_paris = {
    col_name: len(col.cat.categories)
    for col_name, col in df_train[cat_names].items()
}

# One (n_categories, embedding_dim) pair per column; the dimension is half
# the category count (rounded up) and never more than 10.
embedding_sizes = [
    (n_categories, min(10, (n_categories + 1) // 2))
    for n_categories in colname_ncats_paris.values()
]
embedding_sizes # embedding_sizes :: (size of the dictionary of embeddings, the size of each embedding vector)

# Model

##### DNN (MLP)

In [None]:
class DNN(nn.Module):
    """MLP over embedded categorical inputs and batch-normalised continuous inputs.

    Args:
        embedding_sizes: Iterable of ``(n_categories, embedding_dim)`` pairs,
            one per categorical column.
        n_cont: Number of continuous input columns.

    The network produces two output values per sample.
    """

    def __init__(self, embedding_sizes, n_cont):
        super().__init__()
        # One embedding table per categorical column.
        self.embeddings = nn.ModuleList(
            [nn.Embedding(num_embeddings, dim) for num_embeddings, dim in embedding_sizes]
        )
        # Width of all embedding vectors once concatenated.
        self.n_emb = sum(table.embedding_dim for table in self.embeddings)
        self.n_cont = n_cont

        # Fully connected stack: (n_emb + n_cont) -> 100 -> 50 -> 2.
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 100)
        self.lin2 = nn.Linear(100, 50)
        self.lin3 = nn.Linear(50, 2)

        # bn1 normalises the continuous inputs, bn2/bn3 the hidden activations.
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(100)
        self.bn3 = nn.BatchNorm1d(50)

        self.emb_drop = nn.Dropout(0.2)
        self.drops = nn.Dropout(0.2)

    def forward(self, x_cat, x_cont):
        """Return the two output values for each sample of the batch.

        Column ``i`` of ``x_cat`` is looked up in embedding table ``i``;
        ``x_cont`` holds the continuous inputs.
        """
        # Categorical branch: embed each column, concatenate, apply dropout.
        embedded = torch.cat(
            [table(x_cat[:, idx]) for idx, table in enumerate(self.embeddings)], 1
        )
        embedded = self.emb_drop(embedded)

        # Numerical branch is batch-normalised and appended to the embeddings.
        features = torch.cat([embedded, self.bn1(x_cont)], 1)

        # Each hidden layer: linear -> ReLU -> dropout -> batch norm.
        hidden = self.bn2(self.drops(F.relu(self.lin1(features))))
        hidden = self.bn3(self.drops(F.relu(self.lin2(hidden))))

        return self.lin3(hidden)

In [None]:
# Build the DNN (MLP) and move it to the selected device.
model = DNN(embedding_sizes, len(cont_names))
model = to_device(model, device)

#### Defining loss function and optimizer

In [None]:
def get_optimizer(model, lr = 0.001, wd = 0.0):
    """Return an Adam optimizer over the parameters of ``model`` that require gradients.

    Args:
        model: Module whose parameters are optimised.
        lr: Learning rate passed to Adam.
        wd: Weight decay passed to Adam.
    """
    trainable = (param for param in model.parameters() if param.requires_grad)
    return torch_optim.Adam(trainable, lr=lr, weight_decay=wd)

# Loss function and optimizer shared by the training and validation code.
cross_entropy_loss = nn.CrossEntropyLoss()

lr = 0.01  # learning rate
wd = 0.0  # weight decay
optimizer = get_optimizer(model, lr=lr, wd=wd)

### Training DNN

In [None]:
class EarlyStopper:
    """Signal a stop once the validation loss has been too high too often.

    Args:
        patience: Number of counted bad epochs after which to stop.
        min_delta: Margin above the best loss that a loss must exceed to be
            counted as bad.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Record ``validation_loss`` and return True when training should stop."""
        # A new best loss resets the bad-epoch counter.
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Losses not strictly above best + min_delta leave the counter untouched.
        threshold = self.min_validation_loss + self.min_delta
        if not validation_loss > threshold:
            return False

        self.counter += 1
        return self.counter >= self.patience

def train_model(model, train_dl):
    """Train ``model`` for one pass over ``train_dl`` and return the mean loss.

    Uses the module-level ``cross_entropy_loss`` and ``optimizer``. The
    returned value is the per-sample average, i.e. each batch loss is
    weighted by its batch size.
    """
    model.train()
    n_seen = 0
    weighted_loss = 0
    for x_cat, x_cont, target in train_dl:
        n_rows = target.shape[0]
        loss = cross_entropy_loss(model(x_cat, x_cont), target)

        # Standard update step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        n_seen += n_rows
        weighted_loss += n_rows * (loss.item())
    return weighted_loss / n_seen


def val_loss(model, valid_dl):
    """Evaluate ``model`` on ``valid_dl``; print and return (mean loss, accuracy).

    Uses the module-level ``cross_entropy_loss``. The loss is the per-sample
    average (batch losses weighted by batch size); the accuracy is the
    fraction of samples whose highest-scoring output matches the target.
    """
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    # Evaluation needs no gradients. Without no_grad every forward pass
    # would record an autograd graph, costing memory and time for nothing.
    with torch.no_grad():
        for x1, x2, y in valid_dl:
            current_batch_size = y.shape[0]
            out = model(x1, x2)
            loss = cross_entropy_loss(out, y)
            sum_loss += current_batch_size*(loss.item())
            total += current_batch_size
            pred = torch.max(out, 1)[1]  # index of the largest output per row
            correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total


def train_loop(model, epochs):
    """Train for up to ``epochs`` epochs, validating after each one.

    Uses the module-level ``train_dl`` and ``valid_dl``. Training ends early
    when the ``EarlyStopper`` (patience 3, min_delta 0) signals a stop on
    the validation loss.
    """
    stopper = EarlyStopper(patience=3, min_delta=0)
    for epoch in range(epochs):
        train_loss = train_model(model, train_dl)
        print("-Epoch: {} training loss: {}".format(epoch, train_loss))
        valid_loss, _ = val_loss(model, valid_dl)
        print()
        if stopper.early_stop(valid_loss):
            break
        

# Run training for at most 30 epochs (train_loop may stop earlier).
train_loop(model, epochs=30)

### Defining a simple Bayesian model

prior_mu (Float) is the mean of the prior normal distribution.

prior_sigma (Float) is the standard deviation (sigma) of the prior normal distribution.

Reference:

https://jovian.ai/aakanksha-ns/shelter-outcome