In [None]:
# imports
import h5py
import csv
import numpy as np
import os
import pandas as pd
import scipy


<h4>Data Loading</h4>

In [None]:
# Load the two CSV tables and join them into one labelled table.

# Path of the first HDF5 shard (exam_id and tracings); it is not opened in this cell
filename = "./data/exams_part0.hdf5"

# Columns shared by both CSV tables, used as join keys
lista = ['exam_id', 'patient_id']

# Label table: exam_id, patient_id and label
df_sample1 = pd.read_csv("./data/code15_chagas_labels.csv")

# Main exams table
df_sample2 = pd.read_csv("./data/exams.csv")

# Left join: every row of the main table is kept and the label columns are
# attached wherever both keys match
df_master = df_sample2.merge(df_sample1, how='left', on=lista)

# Write the joined table to disk without the index column
df_master.to_csv('data/exams_labels.csv', index=False)

<h4>Code15% data</h4>

In [None]:
#Saving important information in a dictionary

import h5py
import csv
import numpy as np
import os
import pandas as pd
import scipy

# Build the two lookup tables used by the following cells:
#   exam_ids_to_chagas   : exam_id -> Chagas label (bool)
#   exam_ids_and_signals : exam_id -> ECG tracing read from the HDF5 shards


def _parse_chagas_label(raw_value):
    """Convert the text of a CSV 'chagas' cell into a bool.

    csv.DictReader yields every cell as a string, and bool() of any non-empty
    string (including "False" and "0") is True, so the text is compared
    explicitly instead. Empty or missing cells map to False.
    """
    return str(raw_value).strip().lower() in ("true", "1", "1.0", "t", "yes", "y")


# joined exams/labels table stored as CSV
df_master = pd.read_csv('data/exams_labels.csv')

# populating the dictionary exam_ids_to_chagas
exam_ids_to_chagas = dict()

# NOTE(review): the labels are read from merged.csv while df_master is read
# from exams_labels.csv -- confirm that merged.csv is the intended source.
# newline="" is the mode the csv module documentation asks for.
with open("data/merged.csv", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        exam_id = int(row['exam_id'])
        exam_ids_to_chagas[exam_id] = _parse_chagas_label(row['chagas'])

# list of files to read
files = ["./data/exams_part" + str(i) + ".hdf5" for i in range(0, 18)]

# populating the dictionary exam_ids_and_signals
exam_ids_and_signals = dict()

for filename in files:
    # if the file does not exist, skip it
    if not os.path.exists(filename):
        continue

    print("Reading file: ", filename)

    with h5py.File(filename, "r") as ecgs:
        # read the whole id column in one go instead of element by element
        exam_ids = ecgs['exam_id'][:]
        tracings = ecgs['tracings']

        for i, raw_exam_id in enumerate(exam_ids):
            # plain int keys, like the keys of exam_ids_to_chagas
            exam_id = int(raw_exam_id)

            # keep only the exams that have a label
            if exam_id in exam_ids_to_chagas:
                exam_ids_and_signals[exam_id] = tracings[i]

print("Number of ECGS:", len(exam_ids_and_signals))
print("Number of Exams:", len(exam_ids_to_chagas))
# at this point:
# exam_ids_to_chagas maps the exam id to the Chagas disease label
# exam_ids_and_signals maps the exam id to the ECG signal


<h4>Sami-Trop</h4>

In [None]:
# every patient has Chagas, so to have the label i just need to add a column and set it to true for every patient
import csv
import h5py
import numpy as np

sami_exam_ids = []
# Read the exam ids from the CSV file and register a positive label for each.
# newline="" is the mode the csv module documentation asks for.
with open("data/examsSAMI.csv", newline="") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        exam_id = int(row['exam_id'])
        sami_exam_ids.append(exam_id)
        # a label that is already present for this id is left untouched
        exam_ids_to_chagas.setdefault(exam_id, True)

# Read the HDF5 file and add each signal to the dictionary.
# NOTE(review): row i of 'tracings' is paired with the i-th CSV row -- confirm
# that both files share the same ordering and length.
with h5py.File("data/examsSAMI.hdf5", "r") as ecgs:
    tracings = ecgs['tracings']
    for i, exam_id in enumerate(sami_exam_ids):
        exam_ids_and_signals[exam_id] = tracings[i]

# Trim every 4096-sample signal to samples 48..4047 (4000 samples).
start = 48
end = 4048
sliced_shapes = set()
# iterate over a snapshot because values are reassigned inside the loop
for exam_id, ecg in list(exam_ids_and_signals.items()):
    if ecg.shape[0] == 4096:
        sliced_signal = ecg[start:end, :]
        exam_ids_and_signals[exam_id] = sliced_signal
        sliced_shapes.add(sliced_signal.shape)

# One summary line: printing inside the loop referenced `sliced_signal` even
# when nothing had been sliced yet and repeated a stale shape for every signal.
print("Sliced signal shapes: ", sorted(sliced_shapes))
print("Number of ECGS: ", len(exam_ids_and_signals))
print("Number of Exams: ", len(exam_ids_to_chagas))
        

<h4>PTB-XL</h4>

<h4>Butterworth bandpass filter</h4>

In [None]:
# Bandpass filter
def butter_bandpass_filter(data, lowcut, highcut, fs, order):
    """Apply a zero-phase Butterworth band-pass filter along axis 0 of `data`.

    Parameters
    ----------
    data : array_like
        Samples to filter; the filter runs along the first axis.
    lowcut, highcut : float
        Lower and upper edge of the pass band, in the same unit as `fs`.
    fs : float
        Sampling frequency.
    order : int
        Order handed to `scipy.signal.butter`.

    Returns
    -------
    numpy.ndarray
        Filtered samples with the same shape as `data`.
    """
    # Imported explicitly: a bare `import scipy` does not expose `scipy.signal`
    # on SciPy releases without lazy submodule loading.
    from scipy.signal import butter, filtfilt

    # butter() expects the band edges as fractions of the Nyquist frequency
    nyquist_freq = 0.5 * fs
    low = lowcut / nyquist_freq
    high = highcut / nyquist_freq

    b, a = butter(order, [low, high], btype='band')
    # filtfilt runs the filter forward and backward, so no phase shift is added
    y = filtfilt(b, a, data, axis=0)

    return y

# Band-pass settings: sampling frequency and the two pass-band edges
# (the edges are expressed in the same unit as fs)
fs, lowcut, highcut = 400, 0.5, 40

# exam_id -> filtered version of the signal stored in exam_ids_and_signals
filtered_signals = {}

for exam_id in exam_ids_and_signals:
    signal = exam_ids_and_signals[exam_id]
    # third-order band-pass applied to the signal
    filtered_signal = butter_bandpass_filter(signal, lowcut, highcut, fs, 3)
    filtered_signals[exam_id] = filtered_signal

print("Filtering done")
print("Number of Filtered ECGS:", len(filtered_signals))

<h4>Signal shortening</h4>

In [None]:
# Shortening the signals to 10 seconds (4000 samples) if needed
import torch

# exam_id -> (signal, Chagas label)
exam_id_and_tuple = dict()

for exam_id, signal in filtered_signals.items():
    # signals without a label are left out
    if exam_id not in exam_ids_to_chagas:
        continue

    n_samples = signal.shape[0]

    if n_samples < 4000:
        # Too short to be cut to 4000 samples (torch.narrow would raise):
        # report the exam and leave it out instead of aborting the whole loop.
        print("Male", exam_id, n_samples)
        continue

    if n_samples > 4000:
        # convert the signal to a tensor; the copy avoids errors caused by
        # negative strides
        signal = torch.tensor(signal.copy())
        # keep the first 4000 samples along axis 0
        signal = torch.narrow(signal, 0, 0, 4000)

    # NOTE(review): signals that already have 4000 samples stay as they are
    # (not converted to tensors), while shortened ones become torch tensors --
    # confirm that downstream code accepts both.
    exam_id_and_tuple[exam_id] = (signal, exam_ids_to_chagas[exam_id])

print("Numero di tuple (Chagas Label, ECG):", len(exam_id_and_tuple))