In [1]:
import os
from datetime import datetime
import warnings

import numpy as np
import pandas as pd
import scipy.io
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import transforms

warnings.filterwarnings('ignore')

C:\Users\marti\Anaconda3\envs\bakalarka\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\marti\Anaconda3\envs\bakalarka\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll


## Creating and Preprocessing the Data

In [2]:
cardio_dict = scipy.io.loadmat('cardio.mat')

In [3]:
cardio_dict

{'__header__': b'MATLAB 5.0 MAT-file, written by Octave 3.8.0, 2014-12-18 10:48:09 UTC',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[ 0.00491231,  0.69319077, -0.20364049, ...,  0.23149795,
         -0.28978574, -0.49329397],
        [ 0.11072935, -0.07990259, -0.20364049, ...,  0.09356344,
         -0.25638541, -0.49329397],
        [ 0.21654639, -0.27244466, -0.20364049, ...,  0.02459619,
         -0.25638541,  1.14001753],
        ...,
        [-0.41835583, -0.91998844, -0.16463485, ..., -1.49268341,
          0.24461959, -0.49329397],
        [-0.41835583, -0.91998844, -0.15093411, ..., -1.42371616,
          0.14441859, -0.49329397],
        [-0.41835583, -0.91998844, -0.20364049, ..., -1.28578165,
          3.58465295, -0.49329397]]),
 'y': array([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])}

In [4]:
X = cardio_dict['X']
y = cardio_dict['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1, stratify=y_train) # 0.25 x 0.8 = 0.2

In [5]:
print(f'Train set: {len(y_train)} examples')
print(f'Val set: {len(y_val)} examples')
print(f'Test set: {len(y_test)} examples')

Train set: 1098 examples
Val set: 366 examples
Test set: 367 examples


In [6]:
display(X_train.shape)

(1098, 21)

In [7]:
class CardioDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

In [8]:
train_data = CardioDataset(X_train, y_train)
val_data = CardioDataset(X_train, y_train)
test_data = CardioDataset(X_train, y_train)

In [9]:
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=False)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=False)

## The Model

In [10]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encode = nn.Sequential(
            nn.Linear(21, 16),
            nn.ReLU(),
            nn.Linear(16, 10),
        )
        self.decode = nn.Sequential(
            nn.Linear(10, 16),
            nn.ReLU(),
            nn.Linear(16, 21),
        )
    
    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        return x

In [11]:
model = AutoEncoder()
learningRate=1e-3
nr_of_epochs = 20

optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)
criterion = nn.MSELoss()

In [20]:
def train_one_epoch():
    running_loss = 0.
    for i, data in enumerate(train_dataloader):
        inputs, labels = data
        optimizer.zero_grad()

        outputs = model(inputs.float())
        loss = criterion(outputs.float(), labels.float())
        loss.backward()
        optimizer.step()
        
        if (i + 1) % 100 == 0:
            print(f'Batch {i+1}')
        
        running_loss += loss.item()
    return running_loss / len(train_dataloader)

In [28]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
best_loss = 10**12
os.makedirs('best_models/', exist_ok=True)

for epoch in range(nr_of_epochs):
    model.train()
    train_loss = train_one_epoch()
    model.eval()

    valid_loss = 0.0
    for i, data in enumerate(val_dataloader):
        inputs, labels = data
        outputs = model(inputs.float())
        loss = criterion(outputs.float(), labels.float())
        valid_loss += loss
    valid_loss = valid_loss / len(val_dataloader)
    
    print(f'epoch {epoch}; train loss: {train_loss}; valid loss: {valid_loss}')
    if valid_loss < best_loss:
        best_loss = valid_loss
        model_path = 'best_models/model_{}_{}'.format(timestamp, epoch)
        torch.save(model.state_dict(), model_path)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


epoch 0; train loss: 0.0019100920671917265; valid loss: 0.0017081866972148418
epoch 1; train loss: 0.0018716645290624; valid loss: 0.001674265949986875
epoch 2; train loss: 0.0018444239989953025; valid loss: 0.0016234188806265593
epoch 3; train loss: 0.0017972943282479213; valid loss: 0.0015893912641331553
epoch 4; train loss: 0.0017442792102681575; valid loss: 0.0015431968495249748
epoch 5; train loss: 0.0017037030676066126; valid loss: 0.0014973324723541737
epoch 6; train loss: 0.0016498054707578074; valid loss: 0.0014605404576286674
epoch 7; train loss: 0.001624438477645324; valid loss: 0.0014235912822186947
epoch 8; train loss: 0.001574436714690819; valid loss: 0.0013821108732372522
epoch 9; train loss: 0.001534954257496995; valid loss: 0.0013422123156487942
epoch 10; train loss: 0.0014903038080673367; valid loss: 0.001302074291743338
epoch 11; train loss: 0.0014550174221060136; valid loss: 0.0012689492432400584
epoch 12; train loss: 0.0014000200740661563; valid loss: 0.00123225501