# EEC 174AY Lab B2

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from IPython.core.display import HTML
HTML("""
<style>
.text_cell_render p{
    font-size: 130%;
    line-height: 125%;
}
</style>
""")

## Outline

This lab will build your skills in utilizing LSTM networks so that you can apply deep learning to time series information

1. you will code an LSTM network and apply it to a pre-built codebase. Your focus will be on the ML coding
2. You will utilize a partially built code base and then finish it to detect ARDS in waveform data.

## LSTM Network

LSTM is a network that is able to utilize time series information and learn long term patterns to make more accurate predictions than a normal neural network would be able to. We show the general network architecture as an instruction for what you will need to code.

# <img src="/content/drive/MyDrive/Colab Notebooks/Lab_B2/The_LSTM_cell.png" width=55% height=auto\>

You will be applying LSTM to the task of patient ventilator asynchrony (PVA) detection. We have supplied a bit of the code you will need. Your jobs will be the following:

1. Code the `find_scaling_coefs`, `scale_breath`, and `pad_or_cut_breath` methods in the `PVADataset` class in `dataset.py`.
2. Code a simple 1 layer LSTM network based on network schematics given above. You are welcome to use other resource for assistance as well.
3. Run your LSTM model on PVA detection. How well does your model perform compared to your original Random Forest classifier? Why are you getting these results?
4. Code a more complex 3 layer LSTM network. Do additional layers improve results? Why/Why not?

For the math required we would advise you follow the [PyTorch LSTM mathematics](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

In [None]:
from torch import nn
from torch.nn import functional as F

class LSTMNetwork(nn.Module):
    def __init__(self):
        super(LSTMNetwork, self).__init__()
        # Feel free to modify this
        lstm_hidden_units = 32
        # First layer is defined for you. Only have 2 input features (flow, pressure)
        self.ii = nn.Linear(2, lstm_hidden_units)
        # XXX TODO
        self.dummy = nn.Linear(2, lstm_hidden_units)
        self.s2 = nn.Linear(2, lstm_hidden_units)
        self.s3 = nn.Linear(2, lstm_hidden_units)
        self.t1 = nn.Linear(2, lstm_hidden_units)
        # Final layer is defined for you too. Have 3 potential output classes (normal, bsa, dta)
        self.final_classification = nn.Linear(lstm_hidden_units, 3)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

    def forward(self, x):
        # XXX code this up
        trial = self.dummy(x[:, 1, :])
        trial = self.sigmoid(trial)
        ct_next = torch.zeros_like(trial)
        for i in range(x.size(1)):
          r = x[:, i, :]
          ct_prev = ct_next
          sig1 = self.ii(r)
          sig1 = self.sigmoid(sig1)
          sig2 = self.s2(r)
          sig2 = self.sigmoid(sig2)
          sig3 = self.s3(r)
          sig3 = self.sigmoid(sig3)
          tan1 = self.t1(r)
          tan1 = self.tanh(tan1)
          ct_s1 = ct_prev * sig1
          s2_t1 = sig2 * tan1
          ct_next = ct_s1 + s2_t1
          ct_tan = self.tanh(ct_next)
          out = sig3 * ct_tan
          out = self.final_classification(out)
        return out

In [None]:
from pathlib import Path
import os
import pandas as pd
import numpy as np
from glob import glob
import torch
from torch import nn
from torch.autograd import Variable
from torch.optim import SGD
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score

from dataset import PVADataset

model = LSTMNetwork().cuda()
# You should modify the learning rate as suits the problem
optimizer = SGD(model.parameters(), lr=0.01)
bce = nn.BCEWithLogitsLoss()
batch_size = 16


def get_dataset(path, name):
    saved_set = Path(path)
    # Make sure we save previously processed data. This speeds up future processes.
    if saved_set.exists():
        dataset = pd.read_pickle(saved_set.resolve())
    else:
        # use a sequence length of 224 inputs. If you want to shorten this feel free.
        dataset = PVADataset(name, 224)
        dataset.process_dataset()
        pd.to_pickle(dataset, saved_set.resolve())
    return dataset

def get_all_datasets():
    training_set = get_dataset('pva_training_set.pkl', 'train')
    validation_set = get_dataset('pva_validation_set.pkl', 'val')
    testing_set = get_dataset('pva_testing_set.pkl', 'test')
    return training_set, validation_set, testing_set

def perform_training_epoch(train_loader):
    model.train()
    train_loss = 0.0
    train_corrects = 0
    train_total = 0

    for x, y in train_loader:
        x = x.float().cuda()
        y = y.float().cuda()
        optimizer.zero_grad()  # Zero the gradients

        output = model(x)
        loss = bce(output, y)
        loss.backward()
        optimizer.step()

        # Update training loss
        train_loss += loss.item() * x.size(0)

        # Calculate training accuracy
        binary_predictions = (output > 0.5).float()
        train_corrects += torch.sum(binary_predictions == y.data)
        train_total += y.size(0)

    # Calculate training accuracy
    train_accuracy = train_corrects / train_total
    train_loss /= len(train_loader.dataset)

    print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    return train_loss, train_accuracy

def perform_inferencing(loader):
    model.eval()
    all_predictions = []
    all_targets = []
    total_loss = 0.0
    total_corrects = 0
    total_samples = 0

    with torch.no_grad():
        for x, y in loader:
            x = x.float().cuda()

            output = model(x)

            # Calculate loss
            loss = bce(output, y.float().cuda())
            total_loss += loss.item() * x.size(0)

            # Store predictions and ground truth labels
            all_predictions.append(output.cpu().numpy())
            all_targets.append(y.cpu().numpy())

            # Calculate accuracy
            binary_predictions = (output > 0.5).cpu().float()  # Move to CPU
            total_corrects += torch.sum(binary_predictions == y.data.cpu())
            total_samples += y.size(0)

    # Concatenate predictions and ground truth labels
    predictions = np.concatenate(all_predictions)
    targets = np.concatenate(all_targets)

    # Convert probabilities to binary predictions
    binary_predictions = (predictions > 0.5).astype(int)

    # Calculate F1 score
    f1 = f1_score(targets, binary_predictions, average='micro')

    # Calculate accuracy
    accuracy = total_corrects / total_samples

    # Calculate average loss
    average_loss = total_loss / len(loader.dataset)

    print(f"F1 Score: {f1:.4f}")
    return average_loss, accuracy

training_set, validation_set, testing_set = get_all_datasets()
# XXX make sure val and testing share same coefficients as training set!!

train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(testing_set, batch_size=batch_size, shuffle=False)
# You can write up the rest of the code here. We have already given you most of
# what you need to run the module yourself.

num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Training
    perform_training_epoch(train_loader)

    # Validation
    print("Validation Results")
    validation_loss, validation_accuracy = perform_inferencing(val_loader)
    print(f"Validation Loss: {validation_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}")

# Test the model on the testing set
print("Testing Results")
test_loss, test_accuracy = perform_inferencing(test_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1/5
Training Loss: 0.5999, Training Accuracy: 2.0000
Validation Results
F1 Score: 0.0020
Validation Loss: 0.5794, Validation Accuracy: 2.0003
Epoch 2/5
Training Loss: 0.5254, Training Accuracy: 2.1912
Validation Results
F1 Score: 0.5700
Validation Loss: 0.5620, Validation Accuracy: 2.1400
Epoch 3/5
Training Loss: 0.5076, Training Accuracy: 2.3090
Validation Results
F1 Score: 0.5700
Validation Loss: 0.5603, Validation Accuracy: 2.1400
Epoch 4/5
Training Loss: 0.5019, Training Accuracy: 2.3090
Validation Results
F1 Score: 0.5700
Validation Loss: 0.5609, Validation Accuracy: 2.1400
Epoch 5/5
Training Loss: 0.4998, Training Accuracy: 2.3090
Validation Results
F1 Score: 0.5700
Validation Loss: 0.5619, Validation Accuracy: 2.1400
Testing Results
F1 Score: 0.4960
Test Accuracy: 1.9920


For this part of the assignment we use LSTM model to train the data. The F1 score and the accuracy are low because the LSTM predicts the future values based on the past values however in this case we use data from a single breath, which does not use the LSTM's advantage of predicting data.

## ARDS Detection

Regardless of whether you were successful on your last assignment, the design was to show you the internal mechanism about how LSTM works.

In this assignment you will utilize a dataset of ventilation data taken from 50 subjects. 25 subjects have ARDS, 25 subjects do not have ARDS. Your job is to extract waveform data, and utilize it to perform inferencing on whether the patient has ARDS or not.

1. Use basic CNN architecture to perform classification on whether patient has ARDS or not
2. Add LSTM to CNN architecture, do results improve? if not why? In this assignment you should use the [PyTorch LSTM layer.](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)

### Data

The data that we use here is ventilation data but it is structured a bit differently than the PVA dataset. Primarily, the data is structured in continuous breath sequences instead of single breaths. Here is an example.

<img src=ards-data.png width=50% height=auto\>

This has a few advantages:

1. We don't need padding anymore
2. It improves performance of our model

We stack 20 of these breaths together into a tensor that is in shape `(20, 1, 224)`. This allows us to analyze sequential breaths with an LSTM if we desire.

In [None]:
pip install ventmap

Collecting ventmap
  Downloading ventmap-1.5.3.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ventmap
  Building wheel for ventmap (setup.py) ... [?25l[?25hdone
  Created wheel for ventmap: filename=ventmap-1.5.3-py3-none-any.whl size=40312 sha256=ee0bd851f144d690f2d0cc6301a0edaafe821f5257f4bd34276f80f4482e22fd
  Stored in directory: /root/.cache/pip/wheels/6b/18/ac/0abd36110fb734afe3ba7c3e4a69a2c14f8022ee77ba30db13
Successfully built ventmap
Installing collected packages: ventmap
Successfully installed ventmap-1.5.3


In [None]:
from pathlib import Path
from copy import copy
from glob import glob
import math
import os
import re

import numpy as np
import pandas as pd
from scipy.signal import resample
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import DataLoader
from ventmap.raw_utils import read_processed_file

from dataset import ARDSDataset

batch_size = 32


def get_dataset(path, name):
    saved_set = Path(path)
    # Make sure we save previously processed data. This speeds up future processes.
    if saved_set.exists():
        dataset = ARDSDataset.from_pickle(saved_set.resolve())
    else:
        dataset = ARDSDataset(224, name, to_pickle=saved_set.resolve())
    return dataset


def get_all_datasets():
    training_set = get_dataset('ards_training_set.pkl', 'train')
    validation_set = get_dataset('ards_validation_set.pkl', 'val')
    testing_set = get_dataset('ards_testing_set.pkl', 'test')
    return training_set, validation_set, testing_set


training_set, validation_set, testing_set = get_all_datasets()
train_loader = DataLoader(training_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(testing_set, batch_size=batch_size, shuffle=False)


In [None]:
from torch import nn
from torch.optim import SGD
import torch
from torch.autograd import Variable
import torch.nn as nn
import re
import torch.nn.functional as F
import math
from collections import OrderedDict

from cnn_lstm_net import CNNLSTMNetwork
from densenet import densenet18

# You are welcome to evaluate other CNN backbones
cnn = densenet18()

# feel free to modify these parameters
lstm_hidden_units = 128
lstm_layers = 1

# 0 means there is 0% probability of dropout happening
lstm_dropout = 0

model = CNNLSTMNetwork(cnn, lstm_hidden_units, lstm_layers, lstm_dropout)

# We highly recommend using SGD for this problem
optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0001)
bce = nn.BCEWithLogitsLoss()


In [None]:
from sklearn.metrics import f1_score, accuracy_score
def perform_training_epoch(train_loader):
    model.train()
    train_loss = 0.0
    train_corrects = 0
    train_total = 0

    for data in train_loader:

        *_, y, z= data   # Unpack the inner tuple

        x = y.float().cuda()
        y = z.float().cuda()
        optimizer.zero_grad()  # Zero the gradients
        model.cuda()
        output = model(x)
        loss = bce(output, y)
        loss.backward()
        optimizer.step()

        # Update training loss
        train_loss += loss.item() * x.size(0)

        # Calculate training accuracy
        binary_predictions = (output > 0.5).float()
        train_corrects += torch.sum(binary_predictions == y.data)
        train_total += y.size(0)

    # Calculate training accuracy
    train_accuracy = train_corrects / train_total
    train_loss /= len(train_loader.dataset)

    print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

    return train_loss, train_accuracy


def perform_inferencing(loader):
    model.eval()
    all_predictions = []
    all_targets = []
    total_loss = 0.0
    total_corrects = 0
    total_samples = 0

    with torch.no_grad():
        for _, y, z in loader:
            x = y.float().cuda()

            output = model(x)

            # Calculate loss
            loss = bce(output, z.float().cuda())
            total_loss += loss.item() * x.size(0)

            # Store predictions and ground truth labels
            all_predictions.append(output.cpu().numpy())
            all_targets.append(z.cpu().numpy())

            # Calculate accuracy
            binary_predictions = (output > 0.5).cpu().float()
            total_corrects += torch.sum(binary_predictions == z.data.cpu())
            total_samples += z.size(0)

    # Concatenate predictions and ground truth labels
    predictions = np.concatenate(all_predictions)
    targets = np.concatenate(all_targets)

    # Convert probabilities to binary predictions
    binary_predictions = (predictions > 0.5).astype(int)

    # Calculate F1 score and accuracy
    f1 = f1_score(targets, binary_predictions, average='macro')
    accuracy = accuracy_score(targets, binary_predictions)

    # Calculate average loss
    average_loss = total_loss / len(loader.dataset)

    print(f"F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")
    return average_loss, accuracy


num_epochs = 5
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Training
    train_loss, train_accuracy = perform_training_epoch(train_loader)

    # Validation
    print("Validation Results")
    validation_loss, validation_accuracy = perform_inferencing(val_loader)
    print(f"Validation Loss: {validation_loss:.4f}, Validation Accuracy: {validation_accuracy:.4f}")

# Test the model on the testing set
print("Testing Results")
test_loss, test_accuracy = perform_inferencing(test_loader)
print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1/5
Training Loss: 0.6896, Training Accuracy: 1.0000
Validation Results
F1 Score: 0.0000, Accuracy: 0.0000
Validation Loss: 0.6910, Validation Accuracy: 0.0000
Epoch 2/5
Training Loss: 0.6794, Training Accuracy: 0.9999
Validation Results
F1 Score: 0.0004, Accuracy: 0.0002
Validation Loss: 0.6900, Validation Accuracy: 0.0002
Epoch 3/5
Training Loss: 0.6645, Training Accuracy: 1.0115
Validation Results
F1 Score: 0.0008, Accuracy: 0.0004
Validation Loss: 0.6878, Validation Accuracy: 0.0004
Epoch 4/5
Training Loss: 0.6313, Training Accuracy: 1.1313
Validation Results
F1 Score: 0.0008, Accuracy: 0.0004
Validation Loss: 0.6870, Validation Accuracy: 0.0004
Epoch 5/5
Training Loss: 0.5557, Training Accuracy: 1.2869
Validation Results
F1 Score: 0.3501, Accuracy: 0.2709
Validation Loss: 0.6764, Validation Accuracy: 0.2709
Testing Results
F1 Score: 0.4565, Accuracy: 0.4272
Test Accuracy: 0.4272


For this part we are suppose to get a higher F1 scores and accuracy because we pass multiple breaths as the input, these help the LTSM remember when a breath is ARDS and when it is not which helps the model in predicting the outputs better, thereby increasing accuracy.