In [2]:
import ast
import os

import numpy as np
import pandas as pd
import wfdb

## Section 1: Converting the raw signals for processing

**Note: Do not run this notebook unless you have put the PTB-XL database with the folder 'records100' in a folder named 'ptb_xl' under the 'data' folder. If you place the data anywhere else, modify the 'path' variable.**

This notebook converts the raw ECG signals, which are accessed as a 3-dimensional np.array, into:

1. A 3-dimensional np.array that is saved as type float16 to be fed into MTEX-CNN.

2. A dataframe with shape (21837000, 12) that is saved as a gzip-compressed CSV file.

In [2]:
def load_raw_data(df, sampling_rate, path):
    """Read the waveform of every record listed in ``df`` into one array.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``filename_lr`` and ``filename_hr`` columns holding record
        paths relative to ``path``.
    sampling_rate : int
        100 selects the ``filename_lr`` (low rate) records; any other value
        selects the ``filename_hr`` (high rate, 500 Hz) records.
        Note: this project only uses the 100 Hz data.
    path : str
        Root folder of the dataset; a trailing separator is optional.

    Returns
    -------
    numpy.ndarray
        Array built from the per-record signal arrays, in the row order
        of ``df``.
    """
    filenames = df.filename_lr if sampling_rate == 100 else df.filename_hr
    # wfdb.rdsamp returns (signal, metadata); only the signal is kept.
    # os.path.join is used instead of ``path + f`` so that ``path`` works
    # with or without a trailing separator.
    signals = [wfdb.rdsamp(os.path.join(path, f))[0] for f in filenames]
    return np.array(signals)

In [3]:
# Root folder of the PTB-XL download and the sampling rate (in Hz) of the
# records to load; change `path` if the data lives somewhere else.
path = '../data/ptb_xl/'
sampling_rate = 100

In [4]:
# Load the annotation table, indexed by ECG id.
# os.path.join keeps this working whether or not `path` ends with a separator.
meta_data_df = pd.read_csv(os.path.join(path, 'ptbxl_database.csv'), index_col='ecg_id')

# scp_codes is read as text; parse each entry back into a Python object.
meta_data_df['scp_codes'] = meta_data_df['scp_codes'].apply(ast.literal_eval)

# Load the raw signal data for every record listed in the annotation table
raw_signals = load_raw_data(meta_data_df, sampling_rate, path)

Convert `raw_signals` from float64 to float16.

In [8]:
# Downcast to half precision. This rebinds raw_signals, so every cell executed
# after this one sees the float16 array rather than the original values.
raw_signals = raw_signals.astype('float16')

Let's save the raw_signals both as an np.array and as a dataframe.

In [11]:
# Save the signal array to an .npz archive (np.savez appends the extension).
# 'arr_0' is the key np.savez assigns to the first positional array; it is
# spelled out here so readers know what to look up after np.load.
np.savez('../data/created_data_files/ptb_raw_signals', arr_0=raw_signals)

In [5]:
# Names for the three axes of raw_signals, in axis order
names = ['patient', 'time', 'lead']

# One MultiIndex entry per (patient, time, lead) position in the array
index = pd.MultiIndex.from_product(list(map(range, raw_signals.shape)), names=names)

# Long-format Series: one sample per row, labelled by the MultiIndex above.
# NOTE(review): the recorded output of the next cell shows dtype float64, which
# suggests this cell was originally run before the float16 cast; on a
# top-to-bottom run the values will be float16 -- confirm which is intended.
df = pd.Series(raw_signals.flatten(), index=index, name='raw_signals')

In [6]:
# Inspect the long-format Series (indexed by patient, time, lead)
df

patient  time  lead
0        0     0      -0.119
               1      -0.055
               2       0.064
               3       0.086
               4      -0.091
                       ...  
21836    999   7       0.291
               8       0.178
               9       0.106
               10      0.047
               11     -0.103
Name: raw_signals, Length: 262044000, dtype: float64

In [7]:
# Pivot the 'lead' index level into columns (one row per patient/time pair),
# then sort the rows by index.
# This rebinds df: re-running the cell on its own fails because the 'lead'
# level no longer exists; rebuild the long-format Series first.
df = df.unstack(level='lead')
df = df.sort_index()

In [8]:
# Preview the last rows of the wide table (one column per lead)
df.tail()

Unnamed: 0_level_0,lead,0,1,2,3,4,5,6,7,8,9,10,11
patient,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
21836,995,-0.024,-0.016,0.008,0.019,-0.016,-0.003,0.069,0.135,0.073,0.024,-0.012,-0.044
21836,996,0.0,-0.001,-0.002,0.001,0.001,-0.001,0.071,0.148,0.082,0.034,-0.009,-0.085
21836,997,0.091,0.012,-0.079,-0.051,0.085,-0.034,0.069,0.199,0.12,0.063,0.016,-0.117
21836,998,0.175,0.022,-0.153,-0.099,0.164,-0.065,0.07,0.244,0.148,0.08,0.018,-0.108
21836,999,0.166,-0.007,-0.173,-0.08,0.17,-0.09,0.069,0.291,0.178,0.106,0.047,-0.103


In [9]:
# Confirm the wide table's dimensions as (rows, columns)
df.shape

(21837000, 12)

In [12]:
# Write the wide table as a gzip-compressed CSV file.
# index=False omits the (patient, time) index, so only the lead columns are
# written; rows stay in the sorted (patient, time) order.
df.to_csv(
    '../data/created_data_files/ptb_signals_df.csv.gz',
    index=False,
    compression='gzip',
)