Import required libraries

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wfdb
import ast

import time

import wfdb.processing
import wfdb.processing.evaluate
import wfdb.processing.qrs

Helper functions required for data import

In [9]:
# Loader adapted from the example code published by PhysioNet
def load_raw_data(df, sampling_rate, path):
    """Read the waveform of every record listed in ``df`` and return the signals only.

    Parameters
    ----------
    df : object exposing ``filename_lr`` and ``filename_hr``
        Iterables of record names (relative to ``path``), e.g. the PTB-XL
        metadata DataFrame.
    sampling_rate : int
        ``100`` reads the records named in ``df.filename_lr``; any other
        value reads the records named in ``df.filename_hr``.
    path : str
        Prefix that is concatenated directly in front of each record name,
        so it has to end with a path separator.

    Returns
    -------
    numpy.ndarray
        One entry per record holding the signal part returned by
        ``wfdb.rdsamp``; the accompanying metadata is dropped.
    """
    # Pick the filename column that matches the requested sampling rate
    record_names = df.filename_lr if sampling_rate == 100 else df.filename_hr

    # wfdb.rdsamp yields a (signal, metadata) pair for every record
    records = [wfdb.rdsamp(path + record_name) for record_name in record_names]

    # Keep the signal values only and discard the metadata
    return np.array([signal_values for signal_values, _ in records])

In [10]:
# Project root and location of the extracted PTB-XL 1.0.3 dataset.
# NOTE(review): base_path is an absolute, machine-specific location - adjust it
# before running this notebook on another machine.
base_path = r"D:\SynologyDrive\10_Arbeit_und_Bildung\20_Masterstudium\01_Semester\90_Projekt\10_DEV"
path = f"{base_path}/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/"

# Read the record metadata, indexed by ecg_id
features_by_ecg_id = pd.read_csv(f"{path}ptbxl_database.csv", index_col="ecg_id")

# scp_codes is read from the CSV as text; parse every entry into the Python literal it spells
features_by_ecg_id.scp_codes = features_by_ecg_id.scp_codes.apply(ast.literal_eval)

In [32]:
# Build the train / test / validation datasets for R-peak detection.
#
# For every processed record the leads are collapsed into one median signal,
# R-peaks are detected on it, and the record is assigned at random to one of
# the three splits. Each split is finally written to its own CSV file with the
# columns raw_data, feature_rpeak and group_id (one group_id per record).

# --- Configuration ---
SEED = 42                 # fixed seed so that the random split is identical on every run
N_RECORDS = 100           # number of records to process; set to len(features_by_ecg_id) for all
SAMPLING_RATE = 500       # Hz; forwarded to load_raw_data and used as fs for the R-peak detection
TRAIN_UPPER_BOUND = 0.7   # random draws in [0.0, 0.7) go to the training split
TEST_UPPER_BOUND = 0.9    # random draws in [0.7, 0.9) go to the test split, the rest to validation
enable_plot = False       # set to True to plot every record together with its R-peak markers

DATASET_COLUMNS = ['raw_data', 'feature_rpeak', 'group_id']


def _concat_split(frames):
    """Stack the per-record DataFrames of one split; an empty split yields an empty DataFrame."""
    if not frames:
        # pd.concat raises ValueError for an empty list, so build the empty frame explicitly
        return pd.DataFrame(columns=DATASET_COLUMNS)
    return pd.concat(frames)


data_with_features_train = []
data_with_features_test = []
data_with_features_validation = []

# Dedicated random generator: reproducible and independent of the global NumPy random state
rng = np.random.default_rng(SEED)

# Never iterate past the end of the metadata table
n_records = min(N_RECORDS, len(features_by_ecg_id))

for i in range(n_records):

    # Load the raw signal of exactly one record (keeps memory usage low)
    features_by_ecg_id_selected = features_by_ecg_id.iloc[i:i+1]
    raw_data_row_i = load_raw_data(features_by_ecg_id_selected, SAMPLING_RATE, path)[0]

    # Collapse all leads into one signal: median over axis 1 for every sample
    # (assumes the (samples, leads) layout; same reduction as transposing and using axis 0)
    median_lead = np.median(raw_data_row_i, axis=1)

    # Detect the R-peaks on the median signal
    rpeaks = wfdb.processing.xqrs_detect(median_lead, fs=SAMPLING_RATE, verbose=False)

    # Label vector: 1 at every detected R-peak position, 0 everywhere else
    feature_rpeak = np.zeros(len(median_lead))
    feature_rpeak[rpeaks] = 1

    # Draw a random number to decide which split receives this record
    random_number = rng.random()
    if random_number < TRAIN_UPPER_BOUND:
        target_split = data_with_features_train
    elif random_number < TEST_UPPER_BOUND:
        target_split = data_with_features_test
    else:
        target_split = data_with_features_validation

    # group_id is the running number of the record inside its split. It is assigned
    # here, per record, so it stays correct for any record length. Stored as float
    # to keep the format of the written CSV files unchanged.
    df = pd.DataFrame({
        'raw_data': median_lead,
        'feature_rpeak': feature_rpeak,
        'group_id': float(len(target_split)),
    })
    target_split.append(df)

    if enable_plot:
        # Plot the median signal together with the R-peak markers on a large figure
        fig, ax = plt.subplots(figsize=(20, 10))
        ax.plot(median_lead)
        ax.plot(feature_rpeak)
        plt.show()
        plt.close(fig)  # release the figure so that many plots do not pile up in memory

# Convert each list of per-record DataFrames into one DataFrame per split
pd_dataset_train = _concat_split(data_with_features_train)
pd_dataset_test = _concat_split(data_with_features_test)
pd_dataset_validation = _concat_split(data_with_features_validation)

# Delete variables that are not needed anymore
del data_with_features_train
del data_with_features_test
del data_with_features_validation

# Save the data to a file
pd_dataset_train.to_csv(base_path + "/data/pd_dataset_train.csv", index=False)
pd_dataset_test.to_csv(base_path + "/data/pd_dataset_test.csv", index=False)
pd_dataset_validation.to_csv(base_path + "/data/pd_dataset_validation.csv", index=False)