In [None]:
# imports
import h5py
import csv
import numpy as np
import os
import pandas as pd
import scipy


<h4>Data Loading</h4>

In [None]:
#Reading and combining the two csv files

# HDF5 file with exam_id and tracings
filename = "./data/exams_part0.hdf5"

# CSV file with exam_id, patient_Id and label
df_sample1 = pd.read_csv("./data/code15_chagas_labels.csv")

# main CSV file
df_sample2 = pd.read_csv("./data/exams.csv")

lista = ['exam_id','patient_id']

# csv file with exam_id, patient_id and label
df_master = pd.merge(df_sample2, df_sample1, on=lista, how='left')

# saving it as  a file
df_master.to_csv('data/exams_labels.csv', index=False)

<h4>Butterworth bandpass filter, shortening functions and resampling function</h4>

In [2]:
import torch
import scipy.signal
import numpy as np
from wfdb.processing import resample_sig
# Bandpass filter
def butter_bandpass_filter(data,lowcut, highcut, fs,order):

    nyquist_freq = 0.5 * fs
    low = lowcut / nyquist_freq
    high = highcut / nyquist_freq

    b,a = scipy.signal.butter(order, [low, high], btype='band')
    y = scipy.signal.filtfilt(b, a, data,axis=0)

    return y

def shorten_Code(signal):
    
    if signal.shape[0] > 4000:
        # riconverto il segnale in un tensore, utilizzo la copy per evitare errori con stripe negativa
        signal = torch.tensor(signal.copy())
        # taglio il segnale a 10 secondi che corrispondono a 4000 campioni a 400 Hz
        signal = torch.narrow(signal, 0, 0, 4000)
    return signal

def shorten_Sami(signal):
    
    if signal.shape[0] == 4096:
        
        start = 48
        end = 4048
        sliced_signal = signal[start:end,:]
    return sliced_signal

def resample(signal, fs, fs_new):
    try:
        resampled_signal = np.zeros((4000, signal.shape[1]))
        for i in range(signal.shape[1]):

            res_sig,_ = resample_sig(signal[:,i], fs, fs_new)
            resampled_signal[:,i] = res_sig
    except Exception as e:
        print(e)
    
    return resampled_signal
    

<h4>Code15% data</h4>

In [None]:
#Saving important information in a dictionary

import h5py
import csv
import numpy as np
import os
import pandas as pd
import scipy

# reading file HDF5 separing the two datasets
df_master = pd.read_csv('data/exams_labels.csv')
# pupulating the dictionary exam_ids_and_signals
exam_ids_to_chagas = dict()

# array with tuples
tuples= []

with open("data/merged.csv", newline="\n") as csvfile:
    
    reader = csv.DictReader(csvfile)
    for row in reader:
        
        exam_id = int(row['exam_id'])
        boolean = bool(row['chagas'])
        exam_ids_to_chagas[exam_id] = boolean

"""
    Reading the HDF5 files. Each signal is filtered
"""

files = ["./data/exams_part"+str(i)+".hdf5" for i in range(0,18)]

for filename in files:
    # if the file doesn not exist, skip it
    if (os.path.exists(filename) == False):
        continue

    print("Reading file: ", filename)

    with h5py.File(filename, "r") as ecgs:

        exam_ids = list(ecgs['exam_id'])
        num_exams = len(exam_ids)
        for i in range(num_exams):
            
            exam_id = exam_ids[i]
            signal = ecgs['tracings'][i]
            if exam_id not in exam_ids_to_chagas:
                
                continue
            else:
                
                filtered_signal = butter_bandpass_filter(filtered_signal, 0.5, 40, 400, 3)
                filtered_signal = shorten_Code(signal)
                tuples.append((filtered_signal, exam_ids_to_chagas[exam_id]))
for signal,_ in tuples:
    print(signal.shape)
print("Number of Exams:",len(exam_ids_to_chagas))
print("Number of Tuples: ", len(tuples))

Reading file:  ./data/exams_part0.hdf5
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(4000, 12)
(400

<h4>Sami-Trop</h4>

In [None]:
# every patient has Chagas
import csv
import h5py
import numpy as np


"""
    Reading the CSV file to get the exam_ids of the SAMI dataset.
"""
sami_exam_ids = []
with open("data/examsSAMI.csv", newline="\n") as csvfile:
    
    reader = csv.DictReader(csvfile)
    for row in reader:
        
        exam_id = int(row['exam_id'])
        sami_exam_ids.append(exam_id)

"""
    Reading the HDF5 file. Each signal is filtered and then sliced to 4000 samples (10 seconds).
"""
with h5py.File("data/examsSAMI.hdf5", "r") as ecgs:
    
    num_exams = len(sami_exam_ids)
    for i in range(num_exams):
        
        signal = butter_bandpass_filter(ecgs['tracings'][i], 0.5, 40, 400, 3)
        signal = shorten_Sami(signal)
        tuples.append((signal, True))


FileNotFoundError: [Errno 2] No such file or directory: 'data/examsSAMI.csv'

<h4>PTB-XL

In [None]:
import csv
import os
import wfdb
ptb_ecg_ids = [] 
ptb_ecg_and_labels = dict()

main_dir = "data/records500/"

# reading the csv file in order to get the exam_id and adding the label in the dictionary
with open("data/ptbxl_database.csv", newline="\n") as csvfile:
    
    reader = csv.DictReader(csvfile)
    for row in reader:
        
        ecg_id = row['ecg_id']
        if ecg_id not in ptb_ecg_ids:
            
            ptb_ecg_ids.append(ecg_id)
        if ecg_id not in ptb_ecg_and_labels:
            
            ptb_ecg_and_labels[ecg_id] = False

"""
    Reading the PTB-XL dataset. Each signal is filtered and then resampled to 400 Hz.
"""
for subfolder in sorted(os.listdir(main_dir)):
    
    subfolder_path = os.path.join(main_dir, subfolder)
    if os.path.isdir(subfolder_path):

        dat_files = [f for f in os.listdir(subfolder_path) if f.endswith('.dat')]
        for dat_file in dat_files:

            record_name = os.path.join(subfolder_path, dat_file[:-4]) 
            try:
                # Read the signal and header
                record = wfdb.rdsamp(record_name)
                # Extract signal and metadata
                signals, fields = record
                # Apply bandpass filter, resample and shorten the signal
                signals = butter_bandpass_filter(signals, 0.5, 40, 400, 3)
                signals = resample(signals, 500, 400)
                signals = shorten_Code(signals)
                tuples.append((signals, False))
            except Exception as e:

                print(f"Error reading {dat_file}: {e}")

print("Number of ECGS: ", len(tuples)) #83430

DataLoader

In [None]:
from torch.utils.data import DataLoader
from custom_dataset import FinalDataset
import h5py
import numpy as np

dataset = FinalDataset(tuples)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

#save data as 2 different datasets in hdf5 file:
# one dataset has all signals, while the other has all labels

with h5py.File("finalData.hdf5", "w") as f:
    
    signals = f.create_dataset("signals", (len(tuples), 4000, 12), dtype='f')
    labels = f.create_dataset("labels", (len(tuples),), dtype='i')
    
    for i in range(len(tuples)):
        signals[i] = tuples[i][0]
        labels[i] = int(tuples[i][1])

with h5py.File("finalData.hdf5", "r") as f:
    
    signals = f['signals']
    labels = f['labels']
    
    print(signals.shape)
    print(labels.shape)

(20000, 4000, 12)
(20000,)
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1