In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

print("All packages imported successfully!")

In [None]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Logistic Regression using Deep Learning

## Proposed problem
For this activity we will create a prediction model to classify whether a patient tends to have a heart condition or not. Implement a logistic regression using a basic neural network with ReLU and Sigmoid activation functions. It is important to understand the role of nonlinearity in neural network architectures. Build your model by testing different numbers of neurons and hidden layers to achieve the best results. This task should be done using 'pytorch'. For this activity, use the dataset from Kaggle listed in the Dataset section. Note that all columns shall be used (a more detailed explanation of the dataset is available on Kaggle). See the Support Material section for study guidelines.

Finally, write a report with the results. For this report use the LaTeX template present on this repo ('.tex' and '.bib' files).


## Dataset
- https://www.kaggle.com/ronitf/heart-disease-uci

## Support Material/References
### Logistic Regression:
	
- [Professor Andrew NG - Logistic Regression:](https://www.youtube.com/watch?v=-la3q9d7AKQ)
	
- [Pytorch Engineer - Pytorch:](https://www.youtube.com/watch?v=OGpQxIkR4ao&list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4&index=8)
	 
- [Nonlinearities and activation functions:](https://www.deeplearningbook.com.br/funcao-de-ativacao/)

# 1 - Data preprocessing

In [None]:
# Read the dataset (the path is relative to the notebook's working directory)
xy = pd.read_csv(filepath_or_buffer="_data/heart.csv")

# Preview the first ten rows as a pandas dataframe
xy.head(n=10)

In [None]:
# Separate features (x) and labels (y) by column position:
# every column except the last is a feature, the last column is the label
x = xy.iloc[:, :-1]
y = xy.iloc[:, -1]

In [None]:
# TODO: Verify if there is any missing data

# Your code here

In [None]:
# Verify whether the data is imbalanced by counting the samples of each class.
# The vectorized comparisons count by value, so the result does not depend on
# the index labels of `y` (label-based `y[i]` only works with a default
# 0..n-1 index) and no Python-level loop is needed.
num_0 = int((y == 0).sum())
num_1 = int((y == 1).sum())

if num_0 != num_1:
    print("Data is imbalanced!")
else:
    print("Data is balanced!")

**Question**: What is the difference between the training, validation and test datasets? What is each one used for?

**Answer:**

In [None]:
# TODO: split into train, validation and test sets
#
# Hint: use the `stratify` argument of sklearn's train_test_split
# (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
#
# We stratify because the sample is imbalanced (the number of examples in each
# class is not equal). Stratification helps to ensure that the training and
# test data have the same distribution of classes.
#
# The following cells read the names x_train, x_val, x_test, y_train, y_val
# and y_test, so the split must define all six.

# Your code here

In [None]:
# TODO: Normalize the input (x in [0,1])

# Your code here

# 2 - Define model

In [None]:
# TODO: Preprocess data to create dataloaders
class HeartDataset(Dataset):
    ''' Dataset wrapping the inputs and labels so they can be served to a DataLoader
    
        Input:
            x_data: input values (this notebook passes float torch tensors)
            y_data: expected output values (this notebook passes long torch tensors)
    '''
    def __init__(self, x_data, y_data):
        # TODO: complete the constructor
        # It must define the attributes read by the methods below:
        # self.x, self.y and self.n_samples
        
        # YOUR CODE HERE (delete pass after filling with your code)
        
        pass
    
    def __getitem__(self, index):
        # return the (input, label) pair stored at position `index`
        return self.x[index], self.y[index]
    
    def __len__(self):
        # number of samples held by the dataset
        return self.n_samples

# Wrap each split in a HeartDataset (float32 inputs, int64 labels).
# np.asarray accepts NumPy arrays as well as pandas DataFrames/Series (which is
# what train_test_split returns for pandas input), whereas torch.from_numpy
# raises a TypeError for anything that is not a NumPy array.
train_dataset = HeartDataset(torch.as_tensor(np.asarray(x_train), dtype=torch.float32),
                             torch.as_tensor(np.asarray(y_train), dtype=torch.long))
val_dataset = HeartDataset(torch.as_tensor(np.asarray(x_val), dtype=torch.float32),
                           torch.as_tensor(np.asarray(y_val), dtype=torch.long))
test_dataset = HeartDataset(torch.as_tensor(np.asarray(x_test), dtype=torch.float32),
                            torch.as_tensor(np.asarray(y_test), dtype=torch.long))

In [None]:
# To ensure that our mini-batches see all of our classes, we over-sample the
# classes with fewer examples using the WeightedRandomSampler, so that
# imbalanced data can be handled during training.

# Collect the label of every training sample, in dataset order.
# The order must NOT be shuffled: WeightedRandomSampler treats weights[i] as the
# weight of dataset sample i, so target_list[i] has to be the label of
# train_dataset[i]. Shuffling here would attach weights to the wrong samples.
target_list = torch.tensor([int(t) for _, t in train_dataset])

In [None]:
# compute per-class weights (inverse of the class frequency) and build the weighted sampler

def get_class_distribution(obj):
    '''
    Count how many labels of each binary class an iterable contains.

    Input obj (iterable): labels to count; each one is compared against 0 and 1

    Output: dict with the keys "rating_0" and "rating_1" (in that order) mapped
    to their counts. A label equal to neither 0 nor 1 is not counted and
    prints "Check classes." instead.
    '''
    zeros = 0
    ones = 0

    for label in obj:
        if label == 0:
            zeros += 1
            continue
        if label == 1:
            ones += 1
            continue
        # neither class matched
        print("Check classes.")

    return {"rating_0": zeros, "rating_1": ones}

# Inverse-frequency weight per class, ordered as [class 0, class 1]
class_count = list(get_class_distribution(y_train).values())
class_weights = torch.tensor(class_count, dtype=torch.float).reciprocal()

# Look up the class weight of every entry of target_list and hand the
# resulting weights to the sampler (drawing with replacement)
class_weights_all = class_weights[target_list]
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True,
)

In [None]:
# Hyperparameters
num_epochs = 50        # the training loop iterates over range(1, num_epochs+1)
batch_size = 16
# NOTE(review): 1e-6 is a very small step size; if the loss barely moves
# during training, try a larger value
learning_rate = 1e-6

input_size = x.shape[1]  # Number of features (columns of x)
# NOTE(review): the training loop reshapes the labels to (batch, 1), so the
# model output is expected to have a single column; confirm before passing
# num_classes as the model's output size
num_classes = 2  # Number of classes (0 and 1)

**Question**: What is the difference between epochs, batch size and number of iterations?

**Answer:**

In [None]:
# TODO: building the dataloaders 
# hint: don't forget to use the weighted_sampler (built from the training labels)
# as the sampler of the training loader
# https://pytorch.org/docs/stable/data.html

# TODO: complete the dataloaders (the training and evaluation loops iterate
# over them and expect (inputs, labels) batches)
train_loader = None # YOUR CODE HERE
val_loader = None # YOUR CODE HERE
test_loader = None # YOUR CODE HERE

In [None]:
# TODO: build the neural network architecture for logistic regression
# hint: don't forget to add the sigmoid activation function at the output

class NeuralNet(nn.Module):
    ''' Neural network used as the classifier of this notebook (to be completed)

        Input:
            input_size: size of the model input
            output_size: size of the model output
    '''
    def __init__(self, input_size, output_size):
        super(NeuralNet, self).__init__()
        # TODO: complete with the Neural Network architecture (set up the linear layer)
        
        # YOUR CODE HERE (delete pass after filling with your code)
        
        pass
    
    def forward(self, x):
        # TODO: complete the forward method
        # NOTE: the training loop rounds the returned values to obtain the
        # predicted class and compares them with labels of shape (batch, 1)
        
        # YOUR CODE HERE (delete pass after filling with your code)
        
        pass

In [None]:
# TODO: instantiate the model
model = None # YOUR CODE HERE
# move the model to the selected device (GPU or CPU)
model.to(device)

In [None]:
# Loss function and optimizer
 
# TODO: complete with the loss function (hint: use Binary Cross Entropy - BCE) 
# NOTE: the training loop calls criterion(prediction, label) with float labels
# of shape (batch, 1)
criterion = None # YOUR CODE HERE

# TODO: complete with the optimizer (hint: use Adam Optimizer)
# NOTE: the training loop calls optimizer.zero_grad() at the start of every batch
optimizer = None # YOUR CODE HERE

# 3 - Training the model

In [None]:
# Training blackbox: for every epoch, run one pass over the training loader
# (with weight updates) followed by one pass over the validation loader
# (without gradients), then report the average loss and the accuracy of both.
print("Begin training.")
for e in tqdm(range(1, num_epochs+1)):
    
    # monitoring variables 
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    
    # TODO: TRAINING PIPELINE
    for X_train_batch, y_train_batch in train_loader:
        
        # allocate to GPU or CPU
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        
        # clean the gradients (once per batch is enough)
        optimizer.zero_grad()
        
        # TODO: complete the forward pass
        y_train_pred = None # Your code here
        
        # dimension fix: labels become float with shape (batch, 1);
        # .float() converts in place on the current device (no CPU round trip)
        y_train_batch = y_train_batch.float().unsqueeze(1)
        
        # TODO: complete the loss computation
        train_loss = None # Your code here
        
        # train accuracy metric calculation: with normalize=False this is the
        # NUMBER of correct predictions in the batch (arguments are y_true, y_pred)
        train_acc = accuracy_score(y_train_batch.cpu().detach().numpy(), y_train_pred.cpu().detach().numpy().round(), normalize=False)
        
        # TODO: complete the backward pass
        # Your code here 
        
        # TODO: complete the weight update
        # Your code here
        
        # updating loss and epoch accuracy values
        # float() accepts NumPy scalars as well as plain Python numbers, so it
        # works whichever of the two accuracy_score returns
        train_epoch_loss += train_loss.item()
        train_epoch_acc += float(train_acc)


    # VALIDATION PIPELINE
    with torch.no_grad(): # do not use gradients here
        
        # monitoring variables
        val_epoch_loss = 0
        val_epoch_acc = 0
        
        # model evaluation
        model.eval()
        
        # validation iteration for each batch
        for X_val_batch, y_val_batch in val_loader:
            
            # allocate to GPU or CPU
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            
            # forward pass
            y_val_pred = model(X_val_batch)
            
            # dimension fix: labels become float with shape (batch, 1)
            y_val_batch = y_val_batch.float().unsqueeze(1)
            
            # loss calculation
            val_loss = criterion(y_val_pred, y_val_batch)
            
            # validation accuracy metric calculation (number of correct predictions)
            val_acc = accuracy_score(y_val_batch.cpu().detach().numpy(), y_val_pred.cpu().detach().numpy().round(), normalize=False)
            
            # updating loss and epoch accuracy values
            val_epoch_loss += val_loss.item()
            val_epoch_acc += float(val_acc)

    
    # losses are averaged over the batches, accuracies over the samples
    print(f'Epoch {e:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_dataset):.3f}| Val Acc: {val_epoch_acc/len(val_dataset):.3f}')

# 4 - Testing the model

In [None]:
# Testing blackbox: evaluate the trained model on the held-out test set

# monitoring variable: running count of correctly classified test samples
test_epoch_acc = 0

with torch.no_grad(): # do not use gradients here
    
    # model evaluation
    model.eval()
    
    # test iteration
    for X_batch, y_batch in test_loader:
        
        # allocate to GPU or CPU
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        # compute the prediction using the trained model
        y_test_pred = model(X_batch)
        
        # dimension fix: labels become float with shape (batch, 1)
        y_batch = y_batch.float().unsqueeze(1)
        
        # test accuracy metric calculation: compare against the labels of THIS
        # test batch (not the last validation batch left over from training);
        # with normalize=False the result is the number of correct predictions
        test_acc = accuracy_score(y_batch.cpu().detach().numpy(), y_test_pred.cpu().detach().numpy().round(), normalize=False)
        
        # updating epoch accuracy value
        # float() accepts NumPy scalars as well as plain Python numbers
        test_epoch_acc += float(test_acc)
        
# len(test_dataset) is the number of samples; len(test_loader) would be the number of batches
print(f' Number of samples: {len(test_dataset)} Test Score: {test_epoch_acc:.3f} Test Acc: {test_epoch_acc/len(test_dataset):.3f}')