# Download the dataset from Kaggle
The files are saved into the `data` folder.

In [5]:
import os
os.environ['KAGGLE_USERNAME'] = "madalingiurca" # username from the json file
os.environ['KAGGLE_KEY'] = "07e11257e5449820731503b6b0124cad" # key from the json file
!mkdir data

!kaggle datasets download -d shayanfazeli/heartbeat -p data/ 

!unzip -o data/heartbeat.zip -d data/ -x "*mitbih*"
!rm -rf data/*.zip

A subdirectory or file data already exists.
'kaggle' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.
'rm' is not recognized as an internal or external command,
operable program or batch file.


Import modules

In [3]:
!pip install pytorch_lightning -q
from typing import Union, List, Optional

from torch.utils.data import DataLoader, random_split, Dataset
from torch import nn
from pytorch_lightning import LightningDataModule, Trainer

import pandas as pd, numpy as np
import pytorch_lightning as pl
import torch
import torch.nn.functional as F

# Process data

In [4]:
# Read at most 4000 header-less rows from each of the two class files.
df_normal = pd.read_csv('data/ptbdb_normal.csv', nrows=4000, header=None)
df_abnormal = pd.read_csv('data/ptbdb_abnormal.csv', nrows=4000, header=None)

# Stack the frames (normal rows first, then abnormal); no shuffling happens here.
df = pd.concat((df_normal, df_abnormal))

# Row-wise random mask: each row goes to the train part when its uniform draw
# is below 0.8, so the split is roughly (not exactly) 80/20.
msk = np.random.rand(df.shape[0]) < 0.8

train_data, test_data = df[msk], df[~msk]

# Persist the two parts and the full concatenated frame, without header or index.
train_data.to_csv('data/ptbdb_train.csv', index=False, header=False)
test_data.to_csv('data/ptbdb_test.csv', index=False, header=False)
df.to_csv('data/ptbdb.csv', index=False, header=False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/ptbdb_normal.csv'

# Custom Dataset and DataModule

In [None]:
class ECGDataset(Dataset):
    """In-memory dataset read from a header-less, comma-separated file.

    Each row is one sample: every column except the last holds the input
    values (stored as a float32 tensor) and the last column holds the class
    label (stored as an int64 tensor).
    """

    def __init__(self, dataPath):
        """Load the whole file at ``dataPath`` into tensors.

        Args:
            dataPath: anything ``np.loadtxt`` accepts as its file argument
                (a path or an open text file-like object).
        """
        # ndmin=2 keeps a one-row file as shape (1, n_columns); without it
        # np.loadtxt returns a 1-D array and the column slicing below raises
        # IndexError.
        dataset = np.loadtxt(dataPath, delimiter=",", dtype=np.float64, ndmin=2)
        self.no_samples = dataset.shape[0]

        self.x_data = torch.from_numpy(dataset[:, :-1]).float()
        self.y_data = torch.from_numpy(dataset[:, -1]).long()

    def __getitem__(self, item):
        """Return the ``(inputs, label)`` pair stored at index ``item``."""
        return self.x_data[item], self.y_data[item]

    def __len__(self):
        """Return the number of rows that were loaded."""
        return self.no_samples

class ECGDataModule(LightningDataModule):
    """Lightning data module that builds and serves the combined PTB CSV.

    ``prepare_data`` merges the normal/abnormal CSV files found in
    ``data_folder`` into ``ptbdb.csv`` (in the same folder); ``setup`` loads
    that file and splits it 85/15 into train and test subsets.
    """

    def __init__(self, data_folder="data", batch_size=64):
        """
        Args:
            data_folder: directory holding the source CSV files and receiving
                the merged ``ptbdb.csv``.
            batch_size: batch size used by both data loaders.
        """
        super().__init__()
        self.batch_size = batch_size
        self.data_folder = data_folder

    def prepare_data(self, *args, **kwargs):
        """Concatenate up to 4000 rows of each class file into ``ptbdb.csv``."""
        df_normal = pd.read_csv(os.path.join(self.data_folder, 'ptbdb_normal.csv'), header=None, nrows=4000)
        df_abnormal = pd.read_csv(os.path.join(self.data_folder, 'ptbdb_abnormal.csv'), header=None, nrows=4000)
        df = pd.concat([df_normal, df_abnormal])
        # Write next to the inputs so a non-default data_folder is honoured
        # (the path used to be hard-coded to 'data/ptbdb.csv').
        df.to_csv(os.path.join(self.data_folder, 'ptbdb.csv'), header=False, index=False)

    def setup(self, stage: Optional[str] = None):
        """Loading the dataset"""
        print('-' * 3, "Loading Dataset", '-' * 3)

        dataset = ECGDataset(os.path.join(self.data_folder, 'ptbdb.csv'))
        # Give the test split whatever is left after the 85% train share, so
        # the two lengths always add up to len(dataset). Truncating both
        # shares independently can lose a sample, and random_split then
        # raises ValueError.
        train_len = int(len(dataset) * 0.85)
        lengths = [train_len, len(dataset) - train_len]
        self.trainDataset, self.testDataset = random_split(dataset, lengths, generator=torch.Generator())

        print('-' * 3, "Finished Loading", '-' * 3)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Return the loader over the training subset."""
        return DataLoader(self.trainDataset, batch_size=self.batch_size)

    def test_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        """Return the loader over the test subset."""
        return DataLoader(self.testDataset, batch_size=self.batch_size)

# Neural Network Model - CNN

In [None]:
class CNN(pl.LightningModule):
    """1-D convolutional classifier that emits two logits per sample.

    Three convolutional stages are followed by dropout and a stack of linear
    layers. The flattened output of the last convolutional stage must contain
    800 values per sample to match the first linear layer.
    """

    def __init__(self):
        """Create the layers and the tunable hyper-parameter attributes."""
        super().__init__()
        self.learning_rate = 1e-3
        self.batch_size = None

        # Stage 1: two convolutions, no pooling.
        stage_one = [
            nn.Conv1d(1, 5, kernel_size=13, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(5, 10, kernel_size=10, stride=1, padding=1),
            nn.ReLU(),
        ]
        self.layer1 = nn.Sequential(*stage_one)

        # Stage 2: convolution followed by a strided max-pool.
        stage_two = [
            nn.Conv1d(10, 20, kernel_size=4, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=5),
        ]
        self.layer2 = nn.Sequential(*stage_two)

        # Stage 3: convolution followed by a second max-pool.
        stage_three = [
            nn.Conv1d(20, 50, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4, stride=2),
        ]
        self.layer3 = nn.Sequential(*stage_three)

        # Dropout with the library's default probability.
        self.drop_out = nn.Dropout()

        # Classifier head: 800 -> 300 -> 50 -> 2.
        # NOTE(review): there is no non-linearity between these linear layers.
        head = [
            nn.Linear(800, 300),
            nn.Linear(300, 50),
            nn.Linear(50, 2),
        ]
        self.layer4 = nn.Sequential(*head)

    def forward(self, x):
        """Run ``x`` through the conv stages, flatten, apply dropout and the head."""
        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)
        # Collapse everything but the batch dimension.
        flat = x.reshape(x.size(0), -1)
        return self.layer4(self.drop_out(flat))

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters at ``self.learning_rate``."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, train_batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the cross-entropy loss."""
        signals, labels = train_batch
        # Insert a single channel dimension in front of the signal values.
        signals = signals.view(signals.size(0), 1, -1)
        logits = self.forward(signals)
        loss = F.cross_entropy(logits, labels)
        self.log('train_loss', loss)
        return loss

# Training

In [None]:

dataModule = ECGDataModule(batch_size=100)

model = CNN()
# Tuning the trainer.
# Request one GPU only when CUDA is actually available; a hard-coded gpus=1
# makes the Trainer fail on CPU-only machines. None keeps the Trainer on CPU.
trainer = Trainer(max_epochs=100, auto_lr_find=False,
                  gpus=1 if torch.cuda.is_available() else None, profiler=True)
trainer.tune(model, datamodule=dataModule)

# Start training.
trainer.fit(model, datamodule=dataModule)