# Artificial neural network

In [1]:
#imports
from preprocessing.preprocessing import get_preprocessed_brfss_dataset
from preprocessing.neural_network_preprocessing import get_number_of_numerical_features, NeuralNetworkPreprocessor, CATEGORICAL_COLUMNS, NUMERICAL_COLUMNS
from visualization.general_plots import plot_class_frequencies
from visualization.neural_network_plots import plot_loss, plot_accuracy

from sklearn.metrics import fbeta_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
use_mps = True
use_cuda = False

device = torch.device("cpu")

if torch.has_mps and use_mps:
    device = torch.device("mps")
    print("Torch mps activated")

if torch.has_cuda and use_cuda:
    device = torch.device("cuda")
    print("Torch cuda activated")

Torch mps activated


Looking at the dataset it becomes clear that it is imbalanced

In [4]:
preprocessor = NeuralNetworkPreprocessor()
data_train, data_validation, data_test, target_train, target_validation, target_test = preprocessor.get_preprocessed_dataset_for_neural_network()
target_train.head(10)

Unnamed: 0,No,Yes
147853,1.0,0.0
257037,0.0,1.0
217190,1.0,0.0
166796,1.0,0.0
227386,0.0,1.0
43754,1.0,0.0
148571,1.0,0.0
83582,1.0,0.0
23302,1.0,0.0
211067,1.0,0.0


In [5]:
#constants
batch_size = 128
output_size = target_train.shape[1]
embedding_sizes = preprocessor.get_embedding_sizes()
embedding_input_size = sum((nf for ni, nf in embedding_sizes))
numerical_input_size = get_number_of_numerical_features()
input_size = embedding_input_size + numerical_input_size
print(f"Input size: {input_size}")
print(f"Output size: {output_size}")
print(f"Embedding sizes: {embedding_sizes}")

Input size: 50
Output size: 2
Embedding sizes: [(6, 3), (2, 1), (3, 2), (6, 3), (4, 2), (3, 2), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (2, 1), (8, 4), (3, 2), (13, 7), (5, 3), (4, 2), (2, 1), (4, 2), (2, 1), (2, 1), (4, 2), (2, 1)]


In [6]:
class CustomBrfssDataset(data_utils.Dataset):
    def __init__(self, dataset_complete, dataset_target):
        self.data_categorical = dataset_complete[CATEGORICAL_COLUMNS]
        self.data_categorical = torch.tensor(np.array(self.data_categorical)).int()

        self.data_numerical = dataset_complete[NUMERICAL_COLUMNS]
        self.data_numerical = torch.tensor(np.array(self.data_numerical)).float()

        self.target = torch.tensor(np.array(dataset_target)).float()

    def __len__(self):
            return len(self.data_categorical)

    def __getitem__(self, idx):
            return self.data_categorical[idx], self.data_numerical[idx], self.target[idx]

Create torch data loader

In [7]:
train_dataset = CustomBrfssDataset(data_train, target_train)
validation_dataset = CustomBrfssDataset(data_validation, target_validation)

# Custom weighted sampling required because otherwise the f2 score can be zero when all samples are from the same class, random sampling is not suitable
class_count_train = [target_train["No"].sum(),target_train["Yes"].sum()]
class_weights_train = 1./torch.tensor(class_count_train, dtype=torch.float)
class_weights_train_all = class_weights_train[np.where(target_train.to_numpy()==1)[1]]

class_count_validation = [target_validation["No"].sum(),target_validation["Yes"].sum()]
class_weights_validation = 1./torch.tensor(class_count_validation, dtype=torch.float)
class_weights_validation_all = class_weights_validation[np.where(target_validation.to_numpy()==1)[1]]

weighted_sampler_train = data_utils.WeightedRandomSampler(weights=class_weights_train_all, num_samples=target_train.shape[0], replacement=True)
weighted_sampler_validation = data_utils.WeightedRandomSampler(weights=class_weights_validation_all, num_samples=target_validation.shape[0], replacement=True)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, sampler=weighted_sampler_train)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, sampler=weighted_sampler_validation)

Create artificial net, define loss function and define optimizer

In [8]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
        self.batch_norm_numerical = nn.BatchNorm1d(numerical_input_size)
        self.dropout_embedding = nn.Dropout(0.3)

        self.fc1 = nn.Linear(input_size, 100)
        self.bn1 = nn.BatchNorm1d(100)
        self.do1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(100, 5)
        self.bn2 = nn.BatchNorm1d(5)
        self.do2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(5, output_size)

    def forward(self, input_categorical, input_numerical):
        embedding_layers = []
        for index,e in enumerate(self.embeddings):
            embedding_layers.append(e(input_categorical[:,index]))
        x_categorical = torch.cat(embedding_layers, 1)
        x_categorical = self.dropout_embedding(x_categorical)

        x_numerical = self.batch_norm_numerical(input_numerical)
        x = torch.cat([x_categorical, x_numerical], 1)

        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu_(x)
        x = self.do1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu_(x)
        x = self.do2(x)
        x = self.fc3(x)
        x = F.softmax(x)
        return x

In [9]:
def get_accuracy(model: Net, data_loader):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for i, data in enumerate(data_loader, 0):
            inputs_categorical, inputs_numerical, labels = data
            inputs_categorical = inputs_categorical.to(device)
            inputs_numerical = inputs_numerical.to(device)
            labels = labels.to(device)

            outputs = model(inputs_categorical, inputs_numerical)
            _, predicted = torch.max(outputs.data, 1)
            _, labels = torch.max(labels.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct/total

def get_loss(model: Net, criterion, data_loader):
    running_loss = 0.0

    model.train()
    for i, data in enumerate(data_loader, 0):

        inputs_categorical, inputs_numerical, labels = data
        inputs_categorical = inputs_categorical.to(device)
        inputs_numerical = inputs_numerical.to(device)
        labels = labels.to(device)

        outputs = model(inputs_categorical, inputs_numerical)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        pass

    return running_loss/len(data_loader)

def get_f_score(model: Net, data_loader):
    running_predictions = []
    running_labels = []

    model.eval()
    for i, data in enumerate(data_loader, 0):

        inputs_categorical, inputs_numerical, labels = data
        inputs_categorical = inputs_categorical.to(device)
        inputs_numerical = inputs_numerical.to(device)
        labels = labels.to(device)

        outputs = model(inputs_categorical, inputs_numerical)
        _, predicted = torch.max(outputs.data, 1)
        _, labels = torch.max(labels.data, 1)
        running_predictions.extend(predicted.detach().cpu().numpy().ravel())
        running_labels.extend(labels.detach().cpu().numpy().ravel())

        pass

    return fbeta_score(running_labels, running_predictions, beta=2)



Training loop

In [10]:
def train_network(model: Net, criterion, optimizer, data_train_loader, data_validation_loader, n_epochs=5):
    loss_values = []
    accuracy_values = []
    loss_values_validation = []
    accuracy_values_validation = []
    f_scores = []
    f_scores_validation = []

    for epoch in range(n_epochs):

        running_loss = 0.0

        model.train()
        for i, data in enumerate(data_train_loader, 0):

            inputs_categorical, inputs_numerical, labels = data
            inputs_categorical = inputs_categorical.to(device)
            inputs_numerical = inputs_numerical.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(inputs_categorical,inputs_numerical)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 100 == 99:  # print every 100 batches
                print(f'[{epoch + 1}, {(i + 1) * batch_size}] loss: {running_loss / i:.3f}')

            pass

        loss_values.append(running_loss/len(data_train_loader))
        validation_loss = get_loss(model, criterion, data_validation_loader)
        train_accuracy = 100 * get_accuracy(model, data_train_loader)
        validation_accuracy = 100 * get_accuracy(model, data_validation_loader)
        accuracy_values.append(train_accuracy)
        accuracy_values_validation.append(validation_accuracy)
        loss_values_validation.append(validation_loss)

        f_train = get_f_score(net, data_train_loader)
        f_validation = get_f_score(net, data_validation_loader)
        f_scores.append(f_train)
        f_scores_validation.append(f_validation)

        print(f"Epoch {epoch} loss: {str(running_loss/len(data_train_loader))}")
        print(f"Epoch {epoch} validation loss: {validation_loss}")
        print(f'Train accuracy epoch {epoch}: {train_accuracy} %')
        print(f'Validation accuracy epoch {epoch}: {validation_accuracy} %')
        print(f"Train F2-score : {f_train}")
        print(f"Validation F2-score : {f_validation}")

    print('Finished Training')
    return loss_values, loss_values_validation, accuracy_values, accuracy_values_validation, f_scores, f_scores_validation

### Train model without under or oversampling

In [11]:
net = Net()
net.to(device)

criterion_cross_entropy = nn.CrossEntropyLoss()
optimizer_adam = optim.Adam(net.parameters(),lr=0.002)

loss_values, loss_values_validation, accuracy_values, accuracy_values_validation, f_scores, f_scores_validation = train_network(model=net, criterion=criterion_cross_entropy, optimizer=optimizer_adam, data_train_loader=train_loader, data_validation_loader=validation_loader, n_epochs=20)

[1, 12800] loss: 0.616
[1, 25600] loss: 0.589
[1, 38400] loss: 0.569
[1, 51200] loss: 0.554
[1, 64000] loss: 0.542
[1, 76800] loss: 0.533
[1, 89600] loss: 0.525
[1, 102400] loss: 0.518
[1, 115200] loss: 0.513
[1, 128000] loss: 0.507
[1, 140800] loss: 0.503
[1, 153600] loss: 0.500
[1, 166400] loss: 0.496
Epoch 0 loss: 0.49544092969912473
Epoch 0 validation loss: 0.4554537817158482
Train accuracy epoch 0: 87.21178100685685 %
Validation accuracy epoch 0: 87.21178100685685 %
Train F2-score : 0.0
Validation F2-score : 0.0
[2, 12800] loss: 0.460
[2, 25600] loss: 0.457
[2, 38400] loss: 0.455
[2, 51200] loss: 0.454


KeyboardInterrupt: 

In [None]:
plot_loss(loss_values)

In [None]:
plot_accuracy(accuracy_values)

### Train model with undersampling

In [None]:
data_train, data_test, target_train, target_test = preprocessor.get_preprocessed_dataset_for_neural_network_undersampled()

train_dataset = CustomBrfssDataset(data_train, target_train)
test_dataset = CustomBrfssDataset(data_test, target_test)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

net = Net()
net.to(device)

criterion_cross_entropy = nn.CrossEntropyLoss()
optimizer_adam = optim.Adam(net.parameters(),lr=0.002)

loss_values, accuracy_values = train_network(model=net, criterion=criterion_cross_entropy, optimizer=optimizer_adam, data_loader=train_loader)