In [None]:
import numpy as np
import os
import wfdb
from wfdb import rdrecord
import datetime
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from CustomStatisticCalculation import *
from SubModeles.EcgDataset import ECGDataSetItem
from scipy.io import loadmat
from scipy.signal import resample
from array_gzip_io_utils import save_array_as_gz_file
from ecg_classification_helpers import generate_knowlage_integration

In [None]:
# Number of samples in every fixed-length segment cut from a recording.
PART_SIZE = 8000
# Experiment identifier; it only ends up in the output folder name.
EXPERIMENT = "1"
# Window size handed to generate_knowlage_integration for every segment.
KNOWLEDGE_WINDOWS_SIZE = 260
# Target sampling rate: recordings with a different rate are resampled to it.
new_sampling_rate = 400
# Fraction of the segments that goes into the test split.
TEST_SET_SIZE = 0.2

ROOT_DESTINATION_PATH = f"TempData/Data/{PART_SIZE}/Test-set-{TEST_SET_SIZE}/{EXPERIMENT}"
# Built with os.path.join instead of a "..\..\ECG\Databases" literal: the literal
# relied on invalid escape sequences (a SyntaxWarning on recent Python versions)
# and only worked with Windows path separators.
ROOT_DATABASES_PATH = os.path.join("..", "..", "ECG", "Databases")
os.makedirs(ROOT_DESTINATION_PATH, exist_ok=True)

In [None]:
def get_index_time(index, fs):
    """Convert a sample index into an ``HH:MM:SS.ffffff`` offset string.

    Args:
        index: Sample position counted from the start of the signal.
        fs: Sampling rate in samples per second.

    Returns:
        The offset ``index / fs`` seconds formatted with ``'%H:%M:%S.%f'``.
        Only the time-of-day part is formatted, so offsets of 24 hours or
        more wrap around.
    """
    time_ms = index * (1000 / fs)

    # Timezone-aware replacement for datetime.utcfromtimestamp, which is
    # deprecated since Python 3.12; the formatted result is the same.
    time = datetime.datetime.fromtimestamp(time_ms / 1000.0, tz=datetime.timezone.utc)
    return time.strftime('%H:%M:%S.%f')

def resample_ecg(signal, indexes, original_sampling, target_sampling):
    """Resample a signal and rescale its annotation positions to the new rate.

    Args:
        signal: Samples recorded at ``original_sampling``; resampled with
            ``scipy.signal.resample``.
        indexes: Sample positions referring to ``signal``. A numpy array or
            any sequence of integers (list, tuple) is accepted.
        original_sampling: Sampling rate of ``signal``.
        target_sampling: Sampling rate of the returned signal.

    Returns:
        Tuple ``(resampled_signal, new_peak_indexes)``. The resampled signal
        has ``ceil(len(signal) * target_sampling / original_sampling)``
        samples and ``new_peak_indexes`` is an integer numpy array.
    """
    resampling_factor = target_sampling / original_sampling
    # np.asarray lets callers pass plain lists as well as arrays. The scaled
    # positions are truncated (not rounded), so a position that lies inside
    # the original signal always stays inside the resampled one.
    new_peak_indexes = (np.asarray(indexes) * resampling_factor).astype(int)

    new_length = int(np.ceil(len(signal) * resampling_factor))
    resampled_signal = resample(signal, new_length)
    return resampled_signal, new_peak_indexes

def split_signal_to_dataset_items(signal, signal_name, annotation_indexes, annotation_names, supported_annotations, sampling_rate):
    """Cut one recording into fixed-length, min-max scaled ``ECGDataSetItem`` segments.

    The recording is resampled to the module-level ``new_sampling_rate`` when
    ``sampling_rate`` differs from it, then split into consecutive
    non-overlapping parts of ``PART_SIZE`` samples. A trailing part shorter
    than ``PART_SIZE`` is dropped, and so is every part that contains no
    annotation whose symbol is in ``supported_annotations``.

    Args:
        signal: Samples of one lead.
        signal_name: Identifier of the recording, forwarded to each item.
        annotation_indexes: Sample positions of the annotations in ``signal``.
        annotation_names: Annotation symbols, parallel to ``annotation_indexes``.
        supported_annotations: Container of symbols to keep; only membership
            (``symbol in supported_annotations``) is tested.
        sampling_rate: Sampling rate of ``signal`` as passed in.

    Returns:
        List of ``ECGDataSetItem`` objects. The list is empty when the signal
        contains a non-finite value.
    """
    dataset_items = []
    if not np.all(np.isfinite(signal)):
        return dataset_items

    # The comparisons below need array semantics even when a list is passed.
    annotation_indexes = np.asarray(annotation_indexes)
    if sampling_rate != new_sampling_rate:
        signal, annotation_indexes = resample_ecg(signal, annotation_indexes, sampling_rate, new_sampling_rate)

    for index in range(0, len(signal), PART_SIZE):
        raw_part = signal[index:index + PART_SIZE]
        # Check the length before scaling so the incomplete tail is not scaled for nothing.
        if len(raw_part) < PART_SIZE:
            continue
        normalized_signal = minmax_scale(np.array(raw_part, dtype=np.float32))

        part_targets = [0] * PART_SIZE
        part_origin_r_peack = []
        part_annotation_labels = []
        part_annotation_indexs = np.where(np.logical_and(annotation_indexes >= index, annotation_indexes < PART_SIZE + index))
        for annotation_index in part_annotation_indexs[0]:
            annotation_symbol = annotation_names[annotation_index]
            if annotation_symbol not in supported_annotations:
                continue

            part_annotation_labels.append(annotation_symbol)

            # Position of the annotation relative to the start of this part.
            relative_index = annotation_indexes[annotation_index] - index
            part_targets[relative_index] = 1
            part_origin_r_peack.append(relative_index)

        # Skip parts without any supported annotation. The emptiness test must
        # look at the number of peaks: ndarray.any() is False for [0], which
        # used to discard a part whose only peak sits on its first sample.
        if len(part_origin_r_peack) == 0:
            continue
        part_origin_r_peack = np.array(part_origin_r_peack)

        knowledge_part = generate_knowlage_integration(normalized_signal, KNOWLEDGE_WINDOWS_SIZE)
        signal_start_time = get_index_time(index, new_sampling_rate)
        # NOTE(review): `sampling_rate` is the caller's original rate even when the
        # part was resampled to new_sampling_rate - confirm this is what
        # ECGDataSetItem expects.
        dataset_items.append(ECGDataSetItem(normalized_signal, np.array(part_targets, dtype=np.float32), knowledge_part, part_origin_r_peack, np.array(part_annotation_labels), sampling_rate, signal_name, signal_start_time))

    return dataset_items

In [None]:
# MIT arrhythmia dataset load
# Beat symbols that are kept. split_signal_to_dataset_items only tests membership
# (symbol in class_mapper), so the integer values are not used in this cell.
class_mapper = { 'N': 0, 'V': 1, '/': 2, 'R': 3, 'L': 4, 'A': 5, '!': 6, 'E': 7, 'j': 8, 'a': 9, 'f': 10, 'F': 11, 'r': 12, 'J': 12, 'S': 12}

signals_to_skip = ["108", "207"]
# Leads are tried in this order and the first one present in a record is used.
# A record that contains none of them contributes no items.
LEAD_PRIORITIES = ["MLII", "II", "V2"]

# Path parts are joined separately: the former "MIT\mit-..." literal relied on
# an invalid escape sequence and on Windows separators.
mit_db_path = os.path.join(ROOT_DATABASES_PATH, "MIT", "mit-bih-arrhythmia-database-1.0.0")

dataset_items = []
# Computed with numpy (imported by this notebook) so the cell does not depend on
# `math` leaking in through a star import.
knowledge_window_number = int(np.ceil(PART_SIZE / KNOWLEDGE_WINDOWS_SIZE))
with open(os.path.join(mit_db_path, "RECORDS")) as file:
    for line in file:
        line = line.strip()
        if len(line) == 0 or line in signals_to_skip:
            continue

        signal_path = os.path.join(mit_db_path, line)
        record = rdrecord(signal_path)
        ann = wfdb.rdann(signal_path, extension="atr")
        sampling_rate = record.fs

        for lead in LEAD_PRIORITIES:
            if lead in record.sig_name:
                lead_index = record.sig_name.index(lead)
                signal = record.p_signal.T[lead_index]

                new_dataset_items = split_signal_to_dataset_items(signal, record.record_name, ann.sample, ann.symbol, class_mapper, sampling_rate)
                dataset_items.extend(new_dataset_items)
                break

# No separate np.random.shuffle here: train_test_split shuffles by default, and an
# extra unseeded shuffle would make the split differ between runs despite random_state.
train_data, test_data = train_test_split(dataset_items, test_size=TEST_SET_SIZE, random_state=42)

print(f"Train set length: {len(train_data)}")
print(f"Test set length: {len(test_data)}")

save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/train-mit-arrhythmia-fs-{new_sampling_rate}-prefered-leads.pkl.gz', train_data)
save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/test-mit-arrhythmia-fs-{new_sampling_rate}-prefered-leads.pkl.gz', test_data)
del train_data, test_data

In [None]:
# QT dataset load
supported_peak_annotation = ['A', 'S', 'V', 'F', 'a', 'Q', 'N', 'e', '/', 'j', 'R', 'J', 'f']

database_folder = os.path.join(ROOT_DATABASES_PATH, "qt-database-1.0.0")

# Every usable line of "RECORDS_Origin+lead" has three comma-separated fields:
# record name, lead name and annotation-file extension. Lines with another field
# count or an empty lead name are ignored.
signal_settings = []
with open(os.path.join(database_folder, "RECORDS_Origin+lead")) as file:
    for setting in file:
        splited_settings = setting.split(",")
        if len(splited_settings) == 3 and splited_settings[1]:
            signal_settings.append([os.path.join(database_folder, splited_settings[0]), splited_settings[1], splited_settings[2].strip()])

dataset_items = []
# Computed with numpy (imported by this notebook) so the cell does not depend on
# `math` leaking in through a star import.
knowledge_window_number = int(np.ceil(PART_SIZE / KNOWLEDGE_WINDOWS_SIZE))
for signal_path, signal_lead, annotation_ext in signal_settings:
    record = rdrecord(signal_path)
    sampling_rate = record.fs

    # Records that do not contain the configured lead contribute no items.
    if signal_lead not in record.sig_name:
        continue

    lead_index = record.sig_name.index(signal_lead)
    ann = wfdb.rdann(signal_path, extension=annotation_ext)
    signal = record.p_signal.T[lead_index]

    new_dataset_items = split_signal_to_dataset_items(signal, record.record_name, ann.sample, ann.symbol, supported_peak_annotation, sampling_rate)
    dataset_items.extend(new_dataset_items)

# No separate np.random.shuffle here: train_test_split shuffles by default, and an
# extra unseeded shuffle would make the split differ between runs despite random_state.
train_data, test_data = train_test_split(dataset_items, test_size=TEST_SET_SIZE, random_state=42)

print(f"Train set length: {len(train_data)}")
print(f"Test set length: {len(test_data)}")

save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/train-qt-fs-{new_sampling_rate}-prefered-leads.pkl.gz', train_data)
save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/test-qt-fs-{new_sampling_rate}-prefered-leads.pkl.gz', test_data)
del train_data, test_data

In [None]:
# ChinaPhysiologicalSignalChallenge2020 dataset load
# Computed with numpy (imported by this notebook) so the cell does not depend on
# `math` leaking in through a star import.
knowledge_window_number = int(np.ceil(PART_SIZE / KNOWLEDGE_WINDOWS_SIZE))

dataset_folder_path = os.path.join(ROOT_DATABASES_PATH, "ChinaPhysiologicalSignalChallenge2020")
patient_number = 10
sampling_rate = 400
dataset_items = []

for patient in range(1, patient_number + 1):
    # Files are numbered with two digits: A01.mat / RPN_01.mat, ...
    patient_id = str(patient).zfill(2)
    signal_file_path = os.path.join(dataset_folder_path, f"A{patient_id}.mat")
    annotation_file_path = os.path.join(dataset_folder_path, f"RPN_{patient_id}.mat")

    print(f"Start processing ecg {patient}", end="\r")
    signal = loadmat(signal_file_path)
    signal = np.asarray(signal['ecg'], dtype=np.float32)
    signal = np.squeeze(signal)

    ecg_annotation = loadmat(annotation_file_path)
    ecg_annotation = np.asarray(ecg_annotation['R'], dtype=np.int32)
    # squeeze() turns a single annotation into a 0-d array, which has no len();
    # atleast_1d keeps it a one-element vector.
    ecg_annotation = np.atleast_1d(np.squeeze(ecg_annotation))

    # Every annotation in the 'R' variable gets the same label.
    annotaiton_labels = ['R'] * len(ecg_annotation)

    new_dataset_items = split_signal_to_dataset_items(signal, patient, ecg_annotation, annotaiton_labels, ['R'], sampling_rate)
    dataset_items.extend(new_dataset_items)

# No separate np.random.shuffle here: train_test_split shuffles by default, and an
# extra unseeded shuffle would make the split differ between runs despite random_state.
train_data, test_data = train_test_split(dataset_items, test_size=TEST_SET_SIZE, random_state=42)

print(f"Train set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")

save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/train-china-signal-chalenge-2020-fs-{new_sampling_rate}-prefered-leads.pkl.gz', train_data)
del train_data

save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/test-china-signal-chalenge-2020-fs-{new_sampling_rate}-prefered-leads.pkl.gz', test_data)
del test_data

In [None]:
# University of Glasgow ECG dataset load
"""
(C) Luis Howell <2123374H@student.gla.ac.uk>
(C) 2018 Bernd Porr <bernd.porr@glasgow.ac.uk>

GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007

API for the data which loads, filters and exports
the ECG data.
"""

import numpy as np
import scipy.signal as signal


# Class which loads the dataset
class Ecg:
    """Loads one subject/experiment recording stored as TSV files.

    Files are read from ``<root_dir>/subject_<NN>/<experiment>/``:
    ``ECG.tsv`` (required) plus the optional ``annotation_cs.tsv`` and
    ``annotation_cables.tsv``.

    Attributes set by the constructor:
        data: The full table loaded from ``ECG.tsv``.
        cs_V2_V1, einthoven_II, einthoven_III: Columns 0, 1 and 2 of ``data``.
        einthoven_I: ``einthoven_II - einthoven_III``.
        acc_x, acc_y, acc_z: Columns 3, 4 and 5 of ``data``.
        anno_cs, anno_cables: Integer annotation arrays, or ``None`` when the
            file could not be loaded.
        anno_cs_exists, anno_cables_exists: Whether the annotation was loaded.
        T: Sampling period ``1 / fs``.
        t: Time axis, one entry per sample, spaced by ``T``.
    """

    experiments = ["sitting","maths","walking"]
    # Longer experiment list left disabled by the original authors:
    #experiments = ["sitting","maths","walking","hand_bike","jogging"]
    fs=250
    total_subjects = 25

    def __init__(self,root_dir,_subj,_experiment):
        """ Constructor: Specify the path to the data, the subject number and the experiment"""
        self.subj = _subj
        self.experiment = _experiment
        self.subjdir = root_dir+"/"+("subject_%02d" % _subj)+"/"
        self.expdir = self.subjdir+self.experiment+"/"

        # ndmin=2 keeps the column indexing below valid for a one-row file.
        self.data=np.loadtxt(self.expdir+"ECG.tsv", ndmin=2)

        self.anno_cs = self._load_annotation("annotation_cs.tsv")
        self.anno_cs_exists = self.anno_cs is not None
        self.anno_cables = self._load_annotation("annotation_cables.tsv")
        self.anno_cables_exists = self.anno_cables is not None

        self.cs_V2_V1 = self.data[:, 0]
        self.einthoven_II = self.data[:, 1]
        self.einthoven_III = self.data[:, 2]
        self.einthoven_I = self.einthoven_II - self.einthoven_III
        self.acc_x = self.data[:, 3]
        self.acc_y = self.data[:, 4]
        self.acc_z = self.data[:, 5]

        self.T=1/self.fs
        # Sample k lies at k*T. The former linspace(0, T*N, N) included the end
        # point, which stretched the spacing to T*N/(N-1).
        self.t = np.arange(len(self.cs_V2_V1)) * self.T

    def _load_annotation(self, file_name):
        """Return the integer annotations stored in ``file_name``, or None if it cannot be read."""
        try:
            return np.loadtxt(self.expdir+file_name, dtype=int)
        except (OSError, ValueError):
            # Missing or unparsable annotation files are expected for some
            # recordings; anything else (e.g. KeyboardInterrupt) propagates.
            return None

    def filter_data(self):
        """Filters the ECG data with a highpass at 0.1Hz and a bandstop around 50Hz (+/-2 Hz)

        Stores the results in ``cs_V2_V1_filt``, ``einthoven_II_filt`` and
        ``einthoven_III_filt``; ``einthoven_I_filt`` is the difference of the
        filtered leads II and III. Returns None.
        """

        # Cut-off frequencies are normalised to the Nyquist frequency (fs / 2).
        b_dc, a_dc = signal.butter(4, (0.1/self.fs*2), btype='highpass')
        b_50, a_50 = signal.butter(4, [(48/self.fs*2),(52/self.fs*2)], btype='stop')

        def _highpass_then_bandstop(samples):
            # Same order as before: high-pass first, band-stop second.
            return signal.lfilter(b_50, a_50, signal.lfilter(b_dc, a_dc, samples))

        self.cs_V2_V1_filt = _highpass_then_bandstop(self.cs_V2_V1)
        self.einthoven_II_filt = _highpass_then_bandstop(self.einthoven_II)
        self.einthoven_III_filt = _highpass_then_bandstop(self.einthoven_III)

        self.einthoven_I_filt = self.einthoven_II_filt-self.einthoven_III_filt

# Taken from the loader class instead of repeating the literal 250.
sampling_rate = Ecg.fs
# Path parts are joined separately: the former single literal relied on invalid
# escape sequences ("\d", "\e") and on Windows separators.
data_path = os.path.join(ROOT_DATABASES_PATH, "ECG_University_of_Glasgow", "dataset_716", "experiment_data")
dataset_items = []
# Computed with numpy (imported by this notebook) so the cell does not depend on
# `math` leaking in through a star import.
knowledge_window_number = int(np.ceil(PART_SIZE / KNOWLEDGE_WINDOWS_SIZE))

for subject_index in range(Ecg.total_subjects):
    print(f"Processing subject {subject_index}", end="\r")

    for experiment in Ecg.experiments:
        ecg_class = Ecg(data_path, subject_index, experiment)

        # Recordings without annotation_cables.tsv cannot be labelled and are skipped.
        if not ecg_class.anno_cables_exists:
            continue

        ecg_signal = ecg_class.einthoven_II
        # A file with a single annotation loads as a 0-d array, which has no len().
        ecg_annotation = np.atleast_1d(ecg_class.anno_cables)

        # Every entry of the annotation file gets the same label.
        annotaiton_labels = ['R'] * len(ecg_annotation)

        new_dataset_items = split_signal_to_dataset_items(ecg_signal, f"{subject_index}-{experiment}", ecg_annotation, annotaiton_labels, ['R'], sampling_rate)
        dataset_items.extend(new_dataset_items)

# Seeded shuffle so the saved order is the same on every run (42 matches the
# random_state used for the train/test splits in this notebook).
np.random.default_rng(42).shuffle(dataset_items)
save_array_as_gz_file(f'{ROOT_DESTINATION_PATH}/university-of-glasgow-2020-fs-{new_sampling_rate}-prefered-leads.pkl.gz', dataset_items)