In [1]:
pip install wfdb

Collecting wfdb
[?25l  Downloading https://files.pythonhosted.org/packages/b1/a0/922d06ec737e219a9f45545432842e68a84e8b52f292704056eea1d35e41/wfdb-3.1.1.tar.gz (113kB)
[K     |██▉                             | 10kB 18.2MB/s eta 0:00:01[K     |█████▊                          | 20kB 25.8MB/s eta 0:00:01[K     |████████▋                       | 30kB 19.5MB/s eta 0:00:01[K     |███████████▌                    | 40kB 18.5MB/s eta 0:00:01[K     |██████████████▍                 | 51kB 14.7MB/s eta 0:00:01[K     |█████████████████▎              | 61kB 14.4MB/s eta 0:00:01[K     |████████████████████▏           | 71kB 12.2MB/s eta 0:00:01[K     |███████████████████████         | 81kB 13.2MB/s eta 0:00:01[K     |██████████████████████████      | 92kB 13.1MB/s eta 0:00:01[K     |████████████████████████████▉   | 102kB 12.9MB/s eta 0:00:01[K     |███████████████████████████████▊| 112kB 12.9MB/s eta 0:00:01[K     |████████████████████████████████| 122kB 12.9MB/s 
Collecting

In [2]:
# Mount Google Drive at /content/drive; the google.colab import means this
# cell only runs inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math
from scipy.signal import argrelextrema
from sklearn.preprocessing import normalize

import wfdb

In [4]:
# Path *prefix* of the records, not a directory: the two-character record id is
# appended directly to it (data_path + pt), so the trailing 'I' is the start of
# each record's file name.
data_path = '/content/drive/MyDrive/Colab Notebooks/INCART12/I'

In [5]:
# Patient/record ids as zero-padded two-character strings: '01', '02', ..., '75'.
pts = [str(pt).zfill(2) for pt in range(1, 76)]


Let's load all the annotations and see the distribution of heart beat types across all files.

In [6]:
# Count how often each annotation symbol occurs in every record.
# One small frame is built per record and all of them are concatenated in a
# single call; concatenating inside the loop re-copies the accumulated rows on
# every iteration.
per_record_counts = []

for pt in pts:
    annotation = wfdb.rdann(data_path + pt, 'atr')
    sym = annotation.symbol

    values, counts = np.unique(sym, return_counts=True)
    per_record_counts.append(
        pd.DataFrame({'sym': values, 'val': counts, 'pt': [pt] * len(counts)})
    )

# Columns: 'sym' (annotation symbol), 'val' (occurrences in that record),
# 'pt' (record id). pd.concat rejects an empty list, hence the guard.
df = pd.concat(per_record_counts, axis=0) if per_record_counts else pd.DataFrame()

In [7]:
#Let's write a function for loading a single patient's signals and annotations. Note the annotation values are the indices of the signal array.

def load_ecg(file):
    """Read one record together with its 'atr' annotations.

    Parameters
    ----------
    file : str
        Record path without extension, as passed to ``wfdb.rdrecord`` and
        ``wfdb.rdann``.

    Returns
    -------
    tuple
        ``(p_signal, atr_sym, atr_sample)``: the record's ``p_signal``
        attribute, the annotation symbols, and the annotation sample
        positions (indices into the signal array).
    """
    # The record is read first, then its annotation file; the sampling
    # frequency is not checked here.
    signals = wfdb.rdrecord(file).p_signal
    beat_ann = wfdb.rdann(file, 'atr')

    return signals, beat_ann.symbol, beat_ann.sample

In [130]:
def make_dataset(num_cols,pts,lead, num_sec, fs):
    """Build a fixed-width matrix of per-annotation signal segments from one lead.

    Every record is cut into consecutive, non-overlapping windows of
    ``num_sec * fs`` samples. Inside a window the segment length is
    ``ceil(1.2 * median gap between consecutive annotations)`` and
    ``build_XY`` extracts one segment per annotation, starting at the
    annotated sample.

    Annotations are not filtered by symbol, so a non-beat annotation in a
    record produces a row as well.

    Parameters
    ----------
    num_cols : int
        Width of the output rows; must be large enough to hold the longest
        segment (``build_XY`` fails otherwise).
    pts : sequence
        Record ids; each one is appended to the module-level ``data_path``.
    lead : int
        Column of the record's signal array to use.
    num_sec : number
        Window length in seconds.
    fs : number
        Sampling frequency in samples per second (257 for this data set).

    Returns
    -------
    X_all : numpy.ndarray, shape (n_rows, num_cols)
        One row per annotation; shape ``(0, num_cols)`` when nothing was found.
    Y_all : list
        Annotation symbol of each row, in the same order as ``X_all``.

    Raises
    ------
    ValueError
        If ``num_sec * fs`` is not positive (the window loop could not advance).
    """
    window_width = fs*num_sec
    if window_width <= 0:
        raise ValueError('num_sec * fs must be positive, got %r' % (window_width,))

    # Per-window blocks are collected and concatenated once at the end; growing
    # the array with np.append inside the loop re-copies all previous rows on
    # every window.
    x_blocks = []
    Y_all = []

    for pt in pts:
        p_signal, atr_sym, atr_sample = load_ecg(data_path + str(pt))
        atr_sample = np.asarray(atr_sample)

        # grab a lead signal
        p_signal_lead = p_signal[:,lead]

        # Record-wide segment size, used only for windows that hold a single
        # annotation and therefore have no gap of their own to measure.
        record_gaps = np.diff(atr_sample)
        fallback_size = math.ceil(1.2*np.median(record_gaps)) if record_gaps.size else None

        begin_window = 0  # first sample of the current window
        while begin_window < len(p_signal_lead):
            end_window = begin_window + window_width

            # np.where result (a tuple) with the indices of the annotations
            # that fall inside [begin_window, end_window)
            r_in_window = np.where((atr_sample >= begin_window) & (atr_sample < end_window))
            begin_window = end_window  # advance now so every `continue` below is safe

            atr_sample_window = atr_sample[r_in_window]
            if atr_sample_window.size == 0:
                continue  # nothing annotated in this window

            gaps = np.diff(atr_sample_window)
            if gaps.size:
                segment_size = math.ceil(1.2*np.median(gaps))
            elif fallback_size is not None:
                segment_size = fallback_size
            else:
                continue  # a single annotation in the whole record: no gap to size from

            X,Y = build_XY(p_signal_lead,segment_size,atr_sample,atr_sym,r_in_window, num_cols)

            x_blocks.append(X)
            Y_all.extend(Y)

    if x_blocks:
        X_all = np.concatenate(x_blocks, axis = 0)
    else:
        X_all = np.zeros((0, num_cols))

    return X_all, Y_all

######################


def build_XY(p_signal_lead,segment_size,atr_sample,atr_sym,r_in_window, num_cols):
    """Cut one min-max scaled segment per annotation and stack them as rows.

    Parameters
    ----------
    p_signal_lead : numpy.ndarray
        1-D signal of a single lead.
    segment_size : int
        Number of samples taken from each annotated position onwards.
    atr_sample : array-like of int
        Annotation positions (indices into ``p_signal_lead``).
    atr_sym : sequence
        Annotation symbols, aligned with ``atr_sample``.
    r_in_window : tuple
        Result of ``np.where``; its first entry holds the indices (into
        ``atr_sample`` / ``atr_sym``) of the annotations to extract.
    num_cols : int
        Width of the output rows.

    Returns
    -------
    X : numpy.ndarray, shape (len(r_in_window[0]), num_cols)
        Each row is a segment rescaled to [0, 1] and zero-padded on the right.
        A constant (or empty) segment is left as an all-zero row.
    Y : list
        The annotation symbol of each row.

    Raises
    ------
    ValueError
        If a segment is longer than ``num_cols``.
    """
    ann_indices = r_in_window[0]

    X = np.zeros((len(ann_indices), num_cols))
    Y = []

    for row, ann_index in enumerate(ann_indices):
        left = atr_sample[ann_index]

        # Shorter than segment_size only when the slice runs off the end of the signal.
        signal_vector = p_signal_lead[left: left + segment_size]
        step = len(signal_vector)

        if step > num_cols:
            raise ValueError(
                'segment of %d samples does not fit into num_cols=%d' % (step, num_cols)
            )

        Y.append(atr_sym[ann_index])

        if step == 0:
            continue  # nothing to scale; row stays zero

        low = signal_vector.min()
        span = signal_vector.max() - low
        # A flat segment has span 0; dividing would fill the row with NaN, so it
        # is left at zero instead. `!=` (not `>`) keeps NaN input propagating.
        if span != 0:
            X[row, :step] = (signal_vector - low) / span

    return X,Y

In [None]:
# Single-pass run over every record in `pts`.
lead = 0        # column of the signal array to extract
fs = 257        # sampling frequency in Hz
num_sec = 10    # window length in seconds
num_cols = 440  # width of each output row, in samples

X_all, Y_all = make_dataset(num_cols=num_cols, pts=pts, lead=lead, num_sec=num_sec, fs=fs)

In [None]:
# Labels as an (n_rows, 1) column so they can be stacked next to X_all.
symm = np.array(Y_all).reshape(len(Y_all), 1)

In [None]:
# Append the label column to the right of the signal matrix.
# NOTE(review): stacking the numeric X_all with the symbol column in one NumPy
# array presumably coerces every cell to a string dtype — confirm this is intended.
Table = np.hstack((X_all,symm))

In [None]:
# Write the combined table to a CSV in the current working directory.
# index = None is falsy, so presumably no index column is written — TODO confirm.
dff = pd.DataFrame(Table)
dff.to_csv('INCART_Lead0_with_gains.csv', index = None)

If the code above takes too long (the table is huge) or exhausts the RAM, build the table in smaller portions (patients 1–10, 11–20, …) and then glue them together with `vstack`. This is how we did it (see the cells below, at the end). Doing so also had the benefit of confirming that the long run time was not caused by a bug.

In [119]:
# Chunked run: same window settings, lead index 11, rows of 500 samples,
# ten records per make_dataset call.
num_sec = 10
fs = 257
lead = 11
num_cols = 500

# Batch 1: records '01'-'10'. This rebinds X_all / Y_all.
X_all, Y_all = make_dataset(num_cols,pts[0:10],lead, num_sec, fs)



In [120]:
# Batch 2: records '11'-'20'.
X_all2, Y_all2 = make_dataset(num_cols,pts[10:20],lead, num_sec, fs)

In [121]:
# Batch 3: records '21'-'30'.
X_all3, Y_all3 = make_dataset(num_cols,pts[20:30],lead, num_sec, fs)

In [122]:
# Batch 4: records '31'-'40'.
X_all4, Y_all4 = make_dataset(num_cols,pts[30:40],lead, num_sec, fs)

In [123]:
# Batch 5: records '41'-'50'.
X_all5, Y_all5 = make_dataset(num_cols,pts[40:50],lead, num_sec, fs)

In [124]:
# Batch 6: records '51'-'60'.
X_all6, Y_all6 = make_dataset(num_cols,pts[50:60],lead, num_sec, fs)

In [125]:
# Batch 7: records '61'-'70'.
X_all7, Y_all7 = make_dataset(num_cols,pts[60:70],lead, num_sec, fs)

In [126]:
# Batch 8: the remaining records, '71'-'75'.
X_all8,  Y_all8 = make_dataset(num_cols,pts[70:],lead, num_sec, fs)

In [127]:
# Stack the eight per-batch matrices row-wise, in batch order.
# A single call avoids the seven intermediate copies that a pairwise vstack
# chain allocates, which matters for a table this large.
XX = np.vstack((X_all, X_all2, X_all3, X_all4, X_all5, X_all6, X_all7, X_all8))

In [None]:
# Join the eight per-batch label lists into one 1-D array, in the same batch
# order as the rows of the stacked signal matrix. One call replaces the
# pairwise chain and its intermediate arrays.
YY = np.hstack((Y_all, Y_all2, Y_all3, Y_all4, Y_all5, Y_all6, Y_all7, Y_all8))

In [128]:
# Wrap the stacked signal matrix in a DataFrame (default integer index).
dfX = pd.DataFrame(data=XX)

In [129]:
# Persist the stacked signal matrix to Drive. No index argument is passed, so
# pandas' default applies (presumably the row index is written as the first
# column — confirm, since the earlier CSV export passes index = None).
dfX.to_csv('/content/drive/MyDrive/Colab Notebooks/INCART12/X_lead011.csv')

In [None]:
# Labels as an (n_rows, 1) column.
YYY = YY.reshape((len(YY), 1))

In [None]:
# Save the label column to Drive.
# NOTE(review): the file name says lead 00, while the chunked run sets lead = 11
# and the signal matrix is saved as X_lead011.csv — confirm the intended name.
dfY = pd.DataFrame(YYY,index = None)
dfY.to_csv('/content/drive/MyDrive/Colab Notebooks/INCART12/Y_lead00.csv')

In [None]:
# Append the label column to the stacked signal matrix. YYY is the (n_rows, 1)
# label column built from YY, whose rows line up with XX; the previously used
# name `symmm` is not defined in any visible cell and fails on a fresh kernel.
Table = np.hstack((XX, YYY))

In [None]:
# NOTE(review): df_shuffle is not defined in any cell visible here — presumably a
# row-shuffled DataFrame of Table created in a cell that was removed. Restore
# that cell before re-running; as written this raises NameError on a fresh kernel.
df_shuffle.to_csv('INCART12_Lead0_shuffled.csv')