# Variational autoencoder

The variational autoencoder should be able to identify abnormal heartbeat patterns.

In [3]:
from mads_datasets.base import BaseDatastreamer
from mltrainer.preprocessors import BasePreprocessor
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from torch import nn
import torch
import gin
from streamer import VAEstreamer
import vae
from loguru import logger

import sys
import datasets, metrics
import mltrainer
from mltrainer import ReportTypes, Trainer, TrainerSettings


In [11]:
   
# Parquet files holding the binary heartbeat data used for the
# variational autoencoder.
trainfileVAE = Path('../data/heart_train.parq').resolve()
testfileVAE = Path('../data/heart_test.parq').resolve()


def _heart_dataset(parquet_file, *, drop_outliers):
    """Build a HeartDataset1D from `parquet_file` with target="target".

    `drop_outliers` is passed straight through as `outliersRemoval`.
    """
    return datasets.HeartDataset1D(
        parquet_file,
        target="target",
        outliersRemoval=drop_outliers,
    )


def _batch_stream(dataset):
    """Return the iterator from `.stream()` of a VAEstreamer with batchsize 32."""
    streamer = VAEstreamer(dataset, batchsize=32)
    return streamer.stream()


# Outliers are removed from the sets the VAE is trained on ...
traindatasetVAE = _heart_dataset(trainfileVAE, drop_outliers=True)
testdatasetVAE = _heart_dataset(testfileVAE, drop_outliers=True)

# ... while the validation set is the same test file with outliers kept.
validationSetVAE = _heart_dataset(testfileVAE, drop_outliers=False)

# NOTE: these names hold the iterators returned by `.stream()`, not the
# VAEstreamer objects themselves.
trainstreamerVAE = _batch_stream(traindatasetVAE)
teststreamerVAE = _batch_stream(testdatasetVAE)
validationstreamerVAE = _batch_stream(validationSetVAE)


In [13]:
# Load the gin configuration before instantiating any of the vae classes.
gin.parse_config_file(Path('../src/config.gin').resolve())

# --- Smoke test: push one batch through an untrained encoder/decoder pair
# and log the intermediate shapes and the untrained loss.
X1, X2 = next(trainstreamerVAE)

encoder = vae.Encoder()
decoder = vae.Decoder()

latent = encoder(X1)
logger.info(f"the latent shape : {latent.shape}")

x = decoder(latent)
logger.info(f"the shape after: {x.shape}")

lossfn = vae.ReconstructionLoss()
loss = lossfn(x, X2)
logger.info(f"Untrained loss: {loss}")

# --- Training
# Single source of truth for the number of epochs, so the log message and
# the trainer settings cannot drift apart.
epochs = 100

logger.info(f"starting training for {epochs} epochs")
autoencoder = vae.AutoEncoder()

settings = TrainerSettings(
    epochs=epochs,
    # The reconstruction loss doubles as the reported metric.
    metrics=[lossfn],
    logdir="logs",
    train_steps=200,
    valid_steps=200,
    reporttypes=[ReportTypes.TENSORBOARD],
    # Passed on to the scheduler given to the Trainer below
    # (ReduceLROnPlateau: multiply the learning rate by `factor` after
    # `patience` epochs without improvement).
    scheduler_kwargs={"factor": 0.5, "patience": 10},
)

trainer = Trainer(
    model=autoencoder,
    settings=settings,
    loss_fn=lossfn,
    optimizer=torch.optim.Adam,
    traindataloader=trainstreamerVAE,
    # Validation during training uses the test stream.
    validdataloader=teststreamerVAE,
    scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
)
trainer.loop()

[32m2024-06-20 11:37:28.821[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mthe latent shape : torch.Size([32, 2])[0m
[32m2024-06-20 11:37:28.826[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mthe shape after: torch.Size([32, 192])[0m
[32m2024-06-20 11:37:28.830[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mUntrained loss: 11.769255638122559[0m
[32m2024-06-20 11:37:28.831[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mstarting training for 100 epochs[0m
[32m2024-06-20 11:37:28.836[0m | [1mINFO    [0m | [36mmltrainer.settings[0m:[36mcheck_path[0m:[36m61[0m - [1mCreated logdir /home/azureuser/code/nickyvanoorschot_mads_exam_24/notebooks/logs[0m
[32m2024-06-20 11:37:28.837[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to logs/20240620-113728[0m
[32m2024-06-20 11:37:29.271[0m | [1mI

In [None]:
import seaborn as sns
import numpy as np

# Build a confusion matrix over the validation set (outliers kept).
#
# `validationstreamerVAE` is already the iterator returned by `.stream()`,
# so it supports neither `.stream()` nor `len()`. Create a fresh streamer
# object here to obtain both the number of batches and a batch iterator.
# Assumes VAEstreamer implements `__len__` (it is imported alongside
# BaseDatastreamer) - TODO confirm.
validation_streamer = VAEstreamer(validationSetVAE, batchsize=32)
n_batches = len(validation_streamer)
testdata = validation_streamer.stream()

y_true = []
y_pred = []

# Inference only: gradients are not needed.
with torch.no_grad():
    for _ in range(n_batches):
        X, y = next(testdata)
        yhat = autoencoder(X)
        # NOTE(review): argmax over dim 1 treats the model output as class
        # scores. For an autoencoder the output is presumably a
        # reconstruction, and `y` from a VAE streamer may be the input
        # rather than a label - confirm this is the intended comparison.
        yhat = yhat.argmax(dim=1)
        y_pred.append(yhat.cpu().tolist())
        y_true.append(y.cpu().tolist())

# Flatten the per-batch lists into one flat list each.
yhat = [label for batch in y_pred for label in batch]
y = [label for batch in y_true for label in batch]

cfm = confusion_matrix(y, yhat)
# cfm = cfm / np.sum(cfm, axis=1, keepdims=True)

plot = sns.heatmap(cfm, annot=cfm, fmt=".3f")
plot.set(xlabel="Predicted", ylabel="Target")