### Importing Essential Packages

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
torch.cuda.set_device(0) # cuda

### Loading and preparing the dataset 

In [3]:
BASE = './EQ_Dataset/'
df_eq = pd.read_csv(BASE+'df_EQ_dataset.csv', encoding='utf8')

In [4]:
print(len(df_eq.columns))
# df_eq.info()

34


In [5]:
df_eq.columns

Index(['uuid', 'billOrder', 'billDuration', 'billAmmount', 'billType',
       'paymentsMade', 'agedDebtOwing', 'totalPayments', 'overDue', 'billPaid',
       'dueDayM', 'dueMonth', 'dueYear', 'dueDayW', 'billDayM', 'billMonth',
       'billRoute', 'numAccountHolders', 'setupYear', 'minDOB',
       'hasMailAddress', 'city', 'postcode', 'incomeGroup', 'wealthGroup',
       'segment', 'RA_CODE_2016', 'MB_CODE_2016', 'SA1_7DIGITCODE_2016',
       'medianMortgageWkly', 'medianRentWkly', 'medianHhdIncWkly',
       'medianPersonPerBedroom', 'AverageHhdSize'],
      dtype='object')

In [6]:
df_eq['setupYear'].unique()

array([2017., 2010., 2016., 2019., 2004., 2015., 2018., 2012., 2007.,
       1983., 1984., 1992., 2014., 2001., 2003., 1993., 2013., 2008.,
       1996., 1999., 1985., 2009., 1980., 1968., 2011., 1991., 2002.,
       1997., 1981., 2006., 1988., 1998., 2005., 2000., 1987., 1977.,
       1954., 1976., 1965., 1989., 1975., 1994., 1973., 1974., 1970.,
       1995., 1986., 1990., 1955., 1969., 1961., 1979., 1978., 1971.,
       1982., 1967., 1958., 1966., 1963., 1972., 1964., 1943., 1956.,
       1959., 1962., 1952., 1960., 1946., 1957., 1950., 1953., 1941.,
       1947., 1949., 1948., 1951., 1936., 1938., 1925.])

In [7]:
df_eq['minDOB'] = df_eq['minDOB']/10 #convert to decade
df_eq['minDOB'] = df_eq['minDOB'].astype(int)
df_eq['setupYear'] = df_eq['setupYear']/10 #convert to decade
df_eq['setupYear'] = df_eq['setupYear'].astype(int)

In [8]:
# Decide categorical and continuous variables
# cat_names = ['billOrder', 'billType', 'dueDayW', 'dueMonth','dueDayM', 'dueYear', 'billDayM', 'billMonth', 
#              'billRoute', 'numAccountHolders', 'hasMailAddress', 'city', 'postcode', 'incomeGroup', 
#              'wealthGroup', 'segment', 'RA_CODE_2016', 'AverageHhdSize', 'MB_CODE_2016', 'SA1_7DIGITCODE_2016', 
#              'medianPersonPerBedroom', 'minDOB', 'setupYear']

cat_names = ['billOrder', 'billType', 'dueMonth', 'billRoute', 'numAccountHolders', 'hasMailAddress', 'postcode', 'incomeGroup', 
             'wealthGroup', 'AverageHhdSize', 'minDOB', 'setupYear']

cont_names = ['billDuration', 'medianHhdIncWkly', 'medianMortgageWkly', 'medianRentWkly']

target_name = ['billPaid']

# check which variables are excluded
exclude_names = list(set(df_eq.columns) - set(cat_names) - set(cont_names) - set(target_name))
print(len(cat_names), len(cont_names), len(exclude_names), len(target_name))

12 4 17 1


In [9]:
exclude_names

['totalPayments',
 'overDue',
 'uuid',
 'dueYear',
 'dueDayM',
 'segment',
 'MB_CODE_2016',
 'SA1_7DIGITCODE_2016',
 'billMonth',
 'paymentsMade',
 'dueDayW',
 'medianPersonPerBedroom',
 'billDayM',
 'agedDebtOwing',
 'billAmmount',
 'city',
 'RA_CODE_2016']

In [10]:
df_eq = df_eq.drop(exclude_names, axis=1)

In [11]:
df_eq = df_eq.sample(10000).reset_index(drop=True) # take a subset, remove this line after code testing
df_eq

Unnamed: 0,billOrder,billDuration,billType,billPaid,dueMonth,billRoute,numAccountHolders,setupYear,minDOB,hasMailAddress,postcode,incomeGroup,wealthGroup,medianMortgageWkly,medianRentWkly,medianHhdIncWkly,AverageHhdSize
0,11,31,cycle,1,9,postal,3.0,199,193,n,4350.0,2.0,5.0,350.076923,395.0,1638.0,2.5
1,14,86,cycle,1,5,postal,1.0,201,194,y,4700.0,2.0,1.0,384.923077,210.0,974.0,1.9
2,8,88,cycle,1,1,postal,1.0,200,195,n,4814.0,2.0,1.0,353.076923,250.0,735.0,2.0
3,9,96,cycle,0,2,email,1.0,201,197,n,4655.0,2.0,1.0,334.615385,313.0,796.0,2.0
4,8,87,cycle,0,1,email,2.0,200,195,y,4380.0,3.0,4.0,282.461538,188.0,1134.0,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3,90,cycle,0,12,postal,1.0,201,198,n,4700.0,1.0,1.0,320.076923,250.0,1187.0,2.3
9996,6,94,cycle,0,7,email,2.0,199,195,y,4895.0,2.0,1.0,279.923077,200.0,1007.0,2.4
9997,8,89,cycle,0,11,email,2.0,201,196,y,4405.0,6.0,3.0,399.923077,300.0,1712.0,3.1
9998,4,91,cycle,0,12,postal,2.0,200,196,n,4754.0,6.0,3.0,360.000000,158.0,959.0,2.2


### Perform LabelEncoding for categorical variables

In [12]:
for col in cat_names:
    df_eq[col] = LabelEncoder().fit_transform(df_eq[col])

In [13]:
print(len(df_eq.columns))
df_eq

17


Unnamed: 0,billOrder,billDuration,billType,billPaid,dueMonth,billRoute,numAccountHolders,setupYear,minDOB,hasMailAddress,postcode,incomeGroup,wealthGroup,medianMortgageWkly,medianRentWkly,medianHhdIncWkly,AverageHhdSize
0,10,31,0,1,8,2,2,5,2,0,1,1,4,350.076923,395.0,1638.0,12
1,13,86,0,1,4,2,0,7,3,1,102,1,0,384.923077,210.0,974.0,6
2,7,88,0,1,0,2,0,6,4,0,156,1,0,353.076923,250.0,735.0,7
3,8,96,0,0,1,0,0,7,6,0,86,1,0,334.615385,313.0,796.0,7
4,7,87,0,0,0,0,1,6,4,1,25,2,3,282.461538,188.0,1134.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,90,0,0,11,2,0,7,7,0,102,0,0,320.076923,250.0,1187.0,10
9996,5,94,0,0,6,0,1,5,4,1,194,1,0,279.923077,200.0,1007.0,11
9997,7,89,0,0,10,0,1,7,5,1,32,5,2,399.923077,300.0,1712.0,18
9998,3,91,0,0,11,2,1,6,5,0,141,5,2,360.000000,158.0,959.0,9


In [14]:
# df_eq.info()

In [15]:
# making all categorical variables
for col in cat_names:
    df_eq[col] = df_eq[col].astype('category')

### Train Test Split

In [16]:
print('Classes: ', df_eq['billPaid'].unique())
print('----Class Distrution----')
print(df_eq['billPaid'].value_counts()/len(df_eq)*100)

Classes:  [1 0]
----Class Distrution----
1    58.5
0    41.5
Name: billPaid, dtype: float64


In [17]:
# Split Dataset into Train and Test Set
df_train = df_eq.iloc[0 : int(len(df_eq)*0.8)]
df_test = df_eq.iloc[int(len(df_eq)*0.8):]
del df_eq
len(df_train), len(df_test)

(8000, 2000)

### Define and Create Pytorch Dataset

In [18]:
class EqDataset(Dataset):
    def __init__(self, X, y, embedded_col_names):
        X = X.copy()
        self.X1 = X.loc[:,embedded_col_names].copy().values.astype(np.int64) #categorical columns
        self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32) #numerical columns
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.y[idx]

In [19]:
y_train = df_train['billPaid'].values
X_train = df_train.drop(['billPaid'], axis=1)

y_test = df_test['billPaid'].values
X_test = df_test.drop(['billPaid'], axis=1)

In [20]:
#creating train and valid datasets and dataloaders
train_ds = EqDataset(X_train, y_train, cat_names)
valid_ds = EqDataset(X_test, y_test, cat_names)

batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

### Making device (GPU/CPU) compatible

In order to make use of a GPU if available, we'll have to move our data and model to it.

In [21]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
#         return torch.device('cpu')
    else:
        return torch.device('cpu')
    
device = get_default_device()
device

device(type='cuda')

In [22]:
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

In [23]:
class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [24]:
train_dl = DeviceDataLoader(train_dl, device)
valid_dl = DeviceDataLoader(valid_dl, device)

### Prepare Embedding sizes for Categorical Variables

In [25]:
#categorical embedding for columns having more than two values
colname_ncats_paris = {col_name: len(col.cat.categories) for col_name,col in df_train[cat_names].items()}

# Determining size of embedding
embedding_sizes = [(n_categories, min(10, (n_categories+1)//2)) for _, n_categories in colname_ncats_paris.items()]
embedding_sizes # embedding_sizes :: (size of the dictionary of embeddings, the size of each embedding vector)

[(39, 10),
 (2, 1),
 (12, 6),
 (3, 2),
 (3, 2),
 (2, 1),
 (195, 10),
 (7, 4),
 (7, 4),
 (29, 10),
 (10, 5),
 (8, 4)]

# Model

##### DNN (MLP)

In [26]:
class DNN(nn.Module):
    def __init__(self, embedding_sizes, n_cont):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(n_categories, em_dim) for n_categories, em_dim in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
        self.n_emb, self.n_cont = n_emb, n_cont
        
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 100)
        self.lin2 = nn.Linear(100, 50)
        self.lin3 = nn.Linear(50, 2)
        
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(100)
        self.bn3 = nn.BatchNorm1d(50)
        
        self.emb_drop = nn.Dropout(0.2)
        self.drops = nn.Dropout(0.2)
        

    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x) #categorical
        
        x2 = self.bn1(x_cont) #numerical
        x = torch.cat([x, x2], 1)
        
        x = self.lin1(x)
        x = F.relu(x) #layer 1
        x = self.drops(x)
        x = self.bn2(x)
        
        x = F.relu(self.lin2(x)) #layer 2
        x = self.drops(x)
        x = self.bn3(x)
        
        x = self.lin3(x) #final layer
        return x

In [27]:
model = DNN(embedding_sizes, len(cont_names)) # use this for DNN (MLP)
to_device(model, device);

#### Defining loss function and optimizer

In [28]:
def get_optimizer(model, lr = 0.001, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optimizer

cross_entropy_loss = nn.CrossEntropyLoss()
lr=0.01 # learning rate 
wd=0.0 # weight decay
optimizer = get_optimizer(model, lr = lr, wd = wd)

### Training DNN

In [29]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

def train_model(model, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x1, x2, y in train_dl:
        batch = y.shape[0]
        output = model(x1, x2)
        loss = cross_entropy_loss(output, y)   
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total


def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x1, x2, y in valid_dl:
        current_batch_size = y.shape[0]
        out = model(x1, x2)
        loss = cross_entropy_loss(out, y)
        sum_loss += current_batch_size*(loss.item())
        total += current_batch_size
        pred = torch.max(out, 1)[1]
        correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total


def train_loop(model, epochs):
    early_stopper = EarlyStopper(patience=3, min_delta=0)
    for i in range(epochs): 
        loss = train_model(model, train_dl)
        print("-Epoch: {} training loss: {}".format(i, loss))
        vloss, vaccu = val_loss(model, valid_dl)
        print()
        if early_stopper.early_stop(vloss):             
            break
        

train_loop(model, epochs=30)

-Epoch: 0 training loss: 0.6782486999034881
valid loss 0.633 and accuracy 0.631

-Epoch: 1 training loss: 0.6441751028597354
valid loss 0.627 and accuracy 0.648

-Epoch: 2 training loss: 0.6402642159163952
valid loss 0.624 and accuracy 0.632

-Epoch: 3 training loss: 0.6385622253641486
valid loss 0.618 and accuracy 0.656

-Epoch: 4 training loss: 0.6319846116378903
valid loss 0.617 and accuracy 0.648

-Epoch: 5 training loss: 0.631540622971952
valid loss 0.625 and accuracy 0.640

-Epoch: 6 training loss: 0.6302934797108173
valid loss 0.616 and accuracy 0.644

-Epoch: 7 training loss: 0.6241944078728556
valid loss 0.613 and accuracy 0.651

-Epoch: 8 training loss: 0.6234696809574962
valid loss 0.617 and accuracy 0.647

-Epoch: 9 training loss: 0.6264119763672352
valid loss 0.616 and accuracy 0.650

-Epoch: 10 training loss: 0.6265266970545054
valid loss 0.619 and accuracy 0.645



### Defining a simple Bayesian model

prior_mu (Float) is the mean of prior normal distribution.

prior_sigma (Float) is the sigma of prior normal distribution.

Rference:

https://jovian.ai/aakanksha-ns/shelter-outcome