In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
import os
import wfdb
from scipy.signal import medfilt
from sklearn.preprocessing import OneHotEncoder

In [41]:
print(os.getcwd())

/Users/vegy-math808y/Speciale/masters


# preprocessing the AFDB dataset for deep learning algorithm

In [3]:
#patients = ['04043']

patients = ['04015','04043','04048','04126','04746','04908',
            '04936','05091','05121','05261','06426','06453','06995','07162',
            '07859','07879','07910','08215','08219','08378','08405','08434',
            '08455']
# id '00735' and '03665' is not included since no raw ECG values (.dat file) exists 

However, some of these symbols represent the "non-beat" beats and we need to filter them out!

For more information about the beat types, please see the links below: 

https://archive.physionet.org/physiobank/database/html/mitdbdir/tables.htm#testbeats


https://archive.physionet.org/physiobank/database/html/mitdbdir/intro.htm#symbols

In [3]:
non_beat = ['[','!',']','x','(',')','p','t','u','`','\'','^','|','~','+','s','T','*','D','=','"','@','Q','?'
            ,'L','R','V','/','f','F','j','a','E','J','e','S']
#abnormal_beat = ['L','R','V','/','A','f','F','j','a','E','J','e','S']

afib_beat = ['A']


Load the ECG signal and annotation file for a single patient. The ``read_ecg`` function gets a path to the downloaded dataset and returns the signal values/amplitudes, annotation/label type, and the location of the annotations/labels.

sampling frequency of the AFDB ECG records is 250 Hz.




In [4]:
def read_ecg(path):
    
    # Read ECG signal
    record = wfdb.rdrecord(path)
    
    # Get the ECG signal
    ecg_sig = record.p_signal
    # Since there are two leads of ECGs for each record, we only select the first lead to work with.
    ecg_sig = ecg_sig[:,0]
    
    # Read corresponding annotation file
    #annot = wfdb.rdann(path, 'atr')
    
    ann = wfdb.rdann(path, 'atr')
    QRS = wfdb.rdann(path, 'qrs')
   
    #get RPeak location (index) 
    annot_sample = pd.Series(QRS.sample)
    
    #extract and assign labels to individual beats. 
    #Needed since beat types are only defined in intervals in .atr file
    Symb = pd.Series(ann.symbol)
    Samp = pd.Series(ann.sample)
    Rhythm = pd.Series(ann.aux_note)    

    df1 = pd.DataFrame({'Rpeak': QRS.sample})
    df1["label"]=np.nan
    ##print(df1.head(10))
    df2 = pd.DataFrame({'Rpeak': ann.sample,"label": ann.aux_note})
    ##print("dataframe2")
    ##print(df2.head(10))
    newdf = pd.concat([df1,df2], keys = ['1', '2'])
    newdf=newdf.sort_index(ascending=False) 
    #make sure that values from annotation file is on top even if Rpeaks are identical. Important for ffill
    newdf=newdf.sort_values(by=['Rpeak'])
    #perform forward ffill to get annotation to all Rpeaks in QRS.sample
    newdf=newdf.ffill()
    #discard all Rpeaks+Annotations from ann.sample. Avoid adding extra or duplicate beats 
    newdf=newdf.loc[['1']]
    
    #convert label col to series for further processing 
    annot_symbol = pd.Series(list(newdf["label"]))
    
    annot_symbol = annot_symbol.replace('(AFIB', 'A')
    annot_symbol = annot_symbol.replace('(N', 'N')
    
    return ecg_sig, annot_symbol, annot_sample

### Filter using 2 consecutive moving median filters to obtain baseline which is the subtracted from raw signal. Used to remove baseline wander


In [22]:
from scipy.signal import medfilt

ecg_sig1 = ecg_sig[:1000]
sampfrq=250

#Define first window (200ms) - must be odd. therefore subtract 1
window200 = int(200*sampfrq/1000)-1 
#Define second window (600ms)
window600 = int(600*sampfrq/1000)-1
med1 = medfilt(ecg_sig1, kernel_size=window200)
med2 = medfilt(med1, kernel_size=window600)
corr = ecg_sig1-med2

##plot result


plt.subplot(2, 1, 1)
plt.title('Raw')
plt.plot(ecg_sig1)
plt.subplot(2, 1, 2)
plt.subplots_adjust(hspace=0.5)
plt.title('Corrected')
plt.plot(corr, color='red')
plt.show()


NameError: name 'ecg_sig' is not defined

In [4]:
#make filter as function to be called pr patient

def bw_filt(ecg_sig, sampfrq):
    #Define first window (200ms) - must be odd. therefore subtract 1
    window200 = int(200*sampfrq/1000)-1 
    #Define second window (600ms)
    window600 = int(600*sampfrq/1000)-1
    med1 = medfilt(ecg_sig, kernel_size=window200)
    med2 = medfilt(med1, kernel_size=window600)
    corr = ecg_sig-med2
    #return corrected signal
    return corr


In [5]:
#make lowpass filter function
#inspired by https://medium.com/analytics-vidhya/how-to-filter-noise-with-a-low-pass-filter-python-885223e5e9b7

from scipy.signal import butter,filtfilt

def butter_lowpass_filter(data, cutoff, sampfrq):
    # Get the filter coefficients 
    #order = 2       # sin wave can be approx represented as quadratic
    #order = 4 according to https://www.sciencedirect.com/science/article/pii/S0026269219306330
    b, a = butter(4, cutoff, btype='low', analog=False, fs=sampfrq)
    y = filtfilt(b, a, data)
    return y


1. The ``build_dataset`` function gets the list of patients (``patients``), the time interval (``interval``) before and after each heart peak ($\pm$3 seconds for example), sampling frequency (``fs``), which is 250 Hz for AFDB dataset, and the list of abnormal beats symbols (``abnormal_beat``) as defined earlier.



2. The ``build_dataset`` function returns two matrices called ``X`` and ``Y``, which are extracted heartbeats and their coresponding labels (**normal** or **abnormal**), respectively.  It also returns the annotation symbol (``annot_symb``) for each individual extracted beat. It should be mentioned that matrix X rows and columns represent the number of beats and the values of beats (i.e. ``interval`` * ``fs``).

In [21]:
def build_dataset(dataset, interval, fs, afib_beat):
    
    list_of_patients = os.listdir('data/'+dataset)
    list_of_patients= ['data/'+dataset+'/' + i[:-4] for i in list_of_files if i.endswith('.atr')]
    list_of_patients_test = [list_of_files[1]]
    
    # Initialize the arrays
    num_cols = 2 * interval * fs  # This specify the length of the heartbeats to be extracted.
    #X = np.zeros((1,num_cols))
    #Y = np.zeros((1,1))
    X = np.ones((1,num_cols))
    Y = np.ones((1,1))
    annot_symb = []
    ids = []
    """
    df=pd.DataFrame({'types':[],'val':[], 'patient_id':[]})
    values, counts = np.unique(annot_symbol, return_counts=True)
    df1 = pd.DataFrame({'types':values, 'val':counts, 'patient_id':patients[0]*len(counts)})
    """
    # This list stores the number of extracted heartbeats for each patient.
    num_beats = []
    
    #for patient_id in list_of_patients:
    for patient_id in list_of_patients_test:
        file_path = os.getcwd() + '/data/'+dataset+'/'+patient_id
        
        ecg_sig, annot_type, annot_sample = read_ecg_not_afdb(dataset, file_path, fs)
        
        
        # Since there are two leads of ECGs for each record, we only select the first lead to work with.
        ##moved further up in preprocess
        #ecg_sig = ecg_sig[:,0]
        
        
        #Filter baseline wander
        #############
        ecg_sig = bw_filt(ecg_sig, fs)
        #print("ecg_sig shape")
        #print(np.shape(ecg_sig))
        
        #Apply lowpass filter  at 100Hz
        ##**********
        ecg_sig = butter_lowpass_filter(ecg_sig, 100, fs)
        
        # We simply remove the "non-beats" beats from the df_annot dataframe and only keep "normal" and "abnormal" beats.
        df_annot = pd.DataFrame({'annot_type':annot_type,
                              'annot_sample':annot_sample})
        
        f.write(f'N before removal is {len(df_annot)}  for id {patient_id}\n')
        #print(df_annot.describe())
        
        df_annot = df_annot.loc[df_annot.annot_type.isin(afib_beat + ['N'])]
        f.write(f'N after removal is {len(df_annot)}  for id {patient_id}\n')
        
        #print(df_annot.describe())
        
        # The "make_XY" builds the x and y matrics for each extracted heartbeat
        x, y, symbol = make_XY(ecg_sig, df_annot, interval, num_cols, afib_beat)
        annot_symb = annot_symb + symbol
        num_beats.append(x.shape[0])
        X = np.append(X,x,axis = 0)
        Y = np.append(Y,y,axis = 0)
        id_toappend = []
        id_toappend = [patient_id] * len(x)
        ids = ids+id_toappend
        
    # remove top index from both X and Y since these were just used to initialize
    X = np.delete(X,0,axis=0)
    Y = np.delete(Y,0,axis=0)
    
    """
    #oneHotEncode y
    print(Y[1:10])
    encoder = OneHotEncoder(handle_unknown='ignore')
    Y = encoder.fit_transform(Y).toarray()
    print(Y[1:10])

    #transform y to int
    print(Y.dtype)
    Y = np.rint(Y).astype(int)
    print(Y.dtype)
    print(Y[1:10])
    """

    #confirm that arrays are same length
    print(np.shape(ids))   
    print(np.shape(X))
    print(np.shape(Y))
    
    return X, Y, annot_symb, ids


def make_XY(ecg_sig, df_annot, interval, num_cols, afib_beat):
    # this function builds the X,Y matrices for each beat
    # it also returns the original symbols for Y
    
    num_row = len(df_annot)
    print("new_row")
    print(num_row)

    x = np.zeros((num_row, num_cols))
    y = np.zeros((num_row,1))
    symbol = []
    
    # count the rows
    row_size = 0
    nextleft= interval*fs
    
    for annot_sample, annot_type in zip(df_annot.annot_sample.values, df_annot.annot_type.values):

        left = max([0,(annot_sample - interval*fs) ])
        right = min([len(ecg_sig),(annot_sample + interval*fs) ])
        #make condition to control overlap between segments
        if left >= nextleft:      
            xx = ecg_sig[left: right]
            #check that segments are correct size:
            if len(xx) == num_cols:
                x[row_size,:] = xx
                #y[row_size,:] = int(annot_type in afib_beat)
                
                #generate list with symbols of beats included in window
                symblist = df_annot.loc[(df_annot.annot_sample >= left) & (df_annot.annot_sample<= right)]
                #print(symblist.head(20))
                symblist = list(symblist.annot_type)
                symb_vote = compare(symblist,r)
                y[row_size,:] = int(symb_vote in afib_beat)
                #symbol.append(annot_type)
                symbol.append(symb_vote)
                row_size += 1
                nextleft = left + int(round(2*fs*interval*(1-overlap)))
    
    """Original
    for annot_sample, annot_type in zip(df_annot.annot_sample.values, df_annot.annot_type.values):

        left = max([0,(annot_sample - interval*fs) ])
        right = min([len(ecg_sig),(annot_sample + interval*fs) ])
        xx = ecg_sig[left: right]
        if len(xx) == num_cols:
            x[row_size,:] = xx
            y[row_size,:] = int(annot_type in afib_beat)
            symbol.append(annot_type)
            row_size += 1
    """
    x = x[:row_size,:]
    y = y[:row_size,:]
    return x, y, symbol


In [36]:
patients =['04043'] 
f = open("log.txt", "w")

interval = 30 
fs = 250
r = 0.5
overlap = 0.75


def compare(symbols, r):
        numN = sum(map( lambda x: x == 'N', symbols))
        n = len(symbols)
        if numN > n * r:
            return 'N'
        else:
            return 'A'


        
###use to check with masters exctraction of beats/symbols
X, y, annot, ids = build_dataset(patients, interval, fs, afib_beat)

f.close()

new_row
61890
(2351,)
(2351, 15000)
(2351, 1)


In [18]:
foo =['N','N','N','N','N','N','N','ø','ø','ø','ø','ø']
print(len(foo))
w = compare(foo,r)
w

12


'N'

In [16]:
def compare(symbols, r):
        numN = sum(map( lambda x: x == 'N', symbols))
        n = len(symbols)
        if numN > n * r:
            return 'N'
        else:
            return 'A'

In [34]:
##adjust read_ecg for other datasets
"""
wfdb.processing.resample_singlechan(x, ann, fs, fs_target)
Resample a single-channel signal with its annotations.

x: ndarray
The signal array.

annWFDB Annotation
The WFDB annotation object.

fsint, float
The original frequency.

fs_targetint, float
The target frequency.

resampled_xndarray
Array of the resampled signal values.

resampled_annWFDB Annotation
Annotation containing resampled annotation locations.
"""


from wfdb.processing import resample_singlechan
 
def read_ecg_not_afdb(dataset, path, fs_dt):
    
    # Read ECG signal
    record = wfdb.rdrecord(path)
    
    # Get the ECG signal
    ecg_sig = record.p_signal
    # Since there are two leads of ECGs for each record, we only select the first lead to work with.
    ecg_sig = ecg_sig[:,0]
    
    # Read corresponding annotation file
    #annot = wfdb.rdann(path, 'atr')
    
    if dataset == "mitdb":
        ann = wfdb.rdann(path, 'atr')

        #resample to 250Hz like AFDB 
        ecg_sig_r, ann_r = wfdb.processing.resample_singlechan(ecg_sig, ann, fs_dt, 250)
        annot_symbol = pd.Series(ann_r.symbol)
        annot_sample = pd.Series(ann_r.sample)

    #QRS = wfdb.rdann(path, 'qrs')
   
    #get RPeak location (index) 
    #annot_sample = pd.Series(QRS.sample)
    
   
    #Rhythm = pd.Series(ann.aux_note)    
    """
    #extract and assign labels to individual beats. 
    #Needed since beat types are only defined in intervals in .atr file

    df1 = pd.DataFrame({'Rpeak': QRS.sample})
    df1["label"]=np.nan
    ##print(df1.head(10))
    df2 = pd.DataFrame({'Rpeak': ann.sample,"label": ann.aux_note})
    ##print("dataframe2")
    ##print(df2.head(10))
    newdf = pd.concat([df1,df2], keys = ['1', '2'])
    newdf=newdf.sort_index(ascending=False) 
    #make sure that values from annotation file is on top even if Rpeaks are identical. Important for ffill
    newdf=newdf.sort_values(by=['Rpeak'])
    #perform forward ffill to get annotation to all Rpeaks in QRS.sample
    newdf=newdf.ffill()
    #discard all Rpeaks+Annotations from ann.sample. Avoid adding extra or duplicate beats 
    newdf=newdf.loc[['1']]
     #convert label col to series for further processing 
    annot_symbol = pd.Series(list(newdf["label"]))
    """
   
    
    annot_symbol = annot_symbol.replace('(AFIB', 'A')
    annot_symbol = annot_symbol.replace('(N', 'N')
    
    return ecg_sig_r, annot_symbol, annot_sample

In [37]:
#def build_dataset(patients, interval, fs, afib_beat):
def build_dataset(dataset, fs_dt, afib_beat, interval=4):
    
    """set initial variables not passed in"""
    #AFDB sampling rate 250hz
    fs = 250
    # r is the ratio of beats that must be normal in order for window label to be characterized as 'normal' 
    r = 0.5
    #maximum overlap of windows in ecg_signal
    overlap = 0.5
    
    list_of_patients = os.listdir('data/'+dataset)
    #list_of_patients= ['data/'+dataset+'/' + i[:-4] for i in list_of_patients if i.endswith('.atr')]
    list_of_patients= [i[:-4] for i in list_of_patients if i.endswith('.atr')]

    print(list_of_patients)
    list_of_patients_test = [list_of_patients[1]]
    
    # Initialize the arrays
    num_cols = 2 * interval * fs  # This specify the length of the heartbeats to be extracted. 
    #OK to use 250Hz because signal will be resampled in 'read_ECG' to 250Hz
    #X = np.zeros((1,num_cols))
    #Y = np.zeros((1,1))
    X = np.ones((1,num_cols))
    Y = np.ones((1,1))
    annot_symb = []
    ids = []
    """
    df=pd.DataFrame({'types':[],'val':[], 'patient_id':[]})
    values, counts = np.unique(annot_symbol, return_counts=True)
    df1 = pd.DataFrame({'types':values, 'val':counts, 'patient_id':patients[0]*len(counts)})
    """
    # This list stores the number of extracted heartbeats for each patient.
    num_beats = []
    
    for patient_id in list_of_patients:
        file_path = os.getcwd() + '/data/'+dataset+'/' + patient_id
        
        ecg_sig, annot_type, annot_sample = read_ecg_not_afdb(dataset, file_path, fs_dt)
        
        # Since there are two leads of ECGs for each record, we only select the first lead to work with.
        ##moved further up in preprocess
        #ecg_sig = ecg_sig[:,0]
        
        #check what types and how many in each patient
        values, counts = np.unique(annot_type, return_counts=True)
        df = pd.DataFrame({'label':values, 'val':counts})
        #df = pd.DataFrame({'label': ann.symbol})
        #df1 = pd.DataFrame({'Rpeak': QRS.sample})
        print(df.groupby('label').val.sum())
        
        
        #Filter baseline wander
        #############
        ecg_sig = bw_filt(ecg_sig, fs)
        #print("ecg_sig shape")
        #print(np.shape(ecg_sig))
        
        #Apply lowpass filter  at 100Hz
        ##**********
        ecg_sig = butter_lowpass_filter(ecg_sig, 100, fs)
        
        # We simply remove the "non-beats" beats from the df_annot dataframe and only keep "normal" and "abnormal" beats.
        df_annot = pd.DataFrame({'annot_type':annot_type,
                              'annot_sample':annot_sample})
        
        f.write(f'N before removal is {len(df_annot)}  for id {patient_id}\n')
        #print(df_annot.describe())
        
        df_annot = df_annot.loc[df_annot.annot_type.isin(afib_beat + ['N'])]
        f.write(f'N after removal is {len(df_annot)}  for id {patient_id}\n')
        
        #print(df_annot.describe())
        
        # The "make_XY" builds the x and y matrics for each extracted heartbeat
        x, y, symbol = make_XY(ecg_sig, df_annot, interval, num_cols, afib_beat)
        annot_symb = annot_symb + symbol
        num_beats.append(x.shape[0])
        X = np.append(X,x,axis = 0)
        Y = np.append(Y,y,axis = 0)
        id_toappend = []
        id_toappend = [patient_id] * len(x)
        ids = ids+id_toappend
        
    # remove top index from both X and Y since these were just used to initialize
    X = np.delete(X,0,axis=0)
    Y = np.delete(Y,0,axis=0)
    
    #oneHotEncode y
    print(Y[1:10])
    encoder = OneHotEncoder(handle_unknown='ignore')
    Y = encoder.fit_transform(Y).toarray()
    print(Y[1:10])

    #transform y to int
    print(Y.dtype)
    Y = np.rint(Y).astype(int)
    print(Y.dtype)
    print(Y[1:10])

    #confirm that arrays are same length
    print(np.shape(ids))   
    print(np.shape(X))
    print(np.shape(Y))
    
    return X, Y, annot_symb, ids


def make_XY(ecg_sig, df_annot, interval, num_cols, afib_beat):
    # this function builds the X,Y matrices for each beat
    # it also returns the original symbols for Y
    
    num_row = len(df_annot)
    print("new_row")
    print(num_row)

    x = np.zeros((num_row, num_cols))
    y = np.zeros((num_row,1))
    symbol = []
    
    # count the rows
    row_size = 0
    nextleft= interval*fs
    
    for annot_sample, annot_type in zip(df_annot.annot_sample.values, df_annot.annot_type.values):

        left = max([0,(annot_sample - interval*fs) ])
        right = min([len(ecg_sig),(annot_sample + interval*fs) ])
        #make condition to control overlap between segments
        if left >= nextleft:      
            xx = ecg_sig[left: right]
            #check that segments are correct size:
            if len(xx) == num_cols:
                x[row_size,:] = xx
                #y[row_size,:] = int(annot_type in afib_beat)
                
                #generate list with symbols of beats included in window
                symblist = df_annot.loc[(df_annot.annot_sample >= left) & (df_annot.annot_sample<= right)]
                #print(symblist.head(20))
                symblist = list(symblist.annot_type)
                symb_vote = compare(symblist,r)
                y[row_size,:] = int(symb_vote in afib_beat)
                #symbol.append(annot_type)
                symbol.append(symb_vote)
                row_size += 1
                nextleft = left + int(round(2*fs*interval*(1-overlap)))
    
    x = x[:row_size,:]
    y = y[:row_size,:]
    return x, y, symbol


def compare(symbols, r):
        numN = sum(map( lambda x: x == 'N', symbols))
        n = len(symbols)
        if numN > n * r:
            return 'N'
        else:
            return 'A'



In [38]:
f = open("log_mitdb.txt", "a")

#interval = 4


afib_beat = ['A','a']

build_dataset('mitdb', 360, afib_beat)

f.close()

['118', '124', '119', '122', '123', '121', '109', '108', '219', '231', '230', '232', '233', '223', '222', '220', '234', '208', '209', '221', '210', '205', '213', '207', '212', '202', '203', '217', '201', '215', '228', '214', '200', '111', '105', '104', '106', '112', '113', '107', '103', '117', '116', '102', '114', '100', '101', '115']
label
+       1
A      96
R    2166
V      16
x      10
~      12
Name: val, dtype: int64
new_row
96
label
+      13
A       2
F       5
J      29
R    1531
V      47
j       5
~       2
Name: val, dtype: int64
new_row
2
label
+     103
N    1543
V     444
~       4
Name: val, dtype: int64
new_row
1543
label
+       1
N    2476
|       2
Name: val, dtype: int64
new_row
2476
label
+       1
N    1515
V       3
Name: val, dtype: int64
new_row
1515
label
+       1
A       1
N    1861
V       1
~      12
Name: val, dtype: int64
new_row
1862
label
+       1
F       2
L    2492
V      38
~       2
Name: val, dtype: int64
new_row
0
label
+       1
A       4
F   

In [12]:
#Speciale/masters/data/afdb/data_features.pkl
import pandas as pd
import os

print(os.getcwd())
#+ '/data/mitdb/data_features.pkl')  

#Speciale/masters/data/afdb/data_features.pkl

df = pd.read_pickle(os.getcwd()+'/data/afdb/data_features.pkl')  
df.groupby('id').label.describe(include='all')
df.groupby('label').id.describe(include='all')

/Users/vegy-math808y/Speciale/masters


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,25430.0,13.003421,6.482442,1.0,7.0,12.0,19.0,23.0
N,30382.0,11.22905,6.510418,1.0,5.0,12.0,16.0,23.0
O,601.0,11.425957,6.767195,2.0,2.0,15.0,15.0,23.0


In [18]:


#oneHotEncode y
print(y[1:10])
encoder = OneHotEncoder(handle_unknown='ignore')
y = encoder.fit_transform(y).toarray()
print(y[1:10])



NameError: name 'y' is not defined

In [29]:
df = pd.DataFrame(data = y) 
values, counts = np.unique(y, return_counts=True)
df1 = pd.DataFrame({'types':values, 'val':counts})
    
df1



Unnamed: 0,types,val
0,0.0,49877
1,1.0,49877


In [10]:
#np.save("data/afdb/dl6/X_BWcorr_LP_filt100.npy",np.array(X))
#np.save("data/afdb/dl6/y.npy",np.array(y))
#np.save("data/afdb/dl6/annot_data.npy",np.array(annot))


In [None]:
#save dataset w/o lowpass filtering
np.save("data/afdb/dl6/X_BWcorr.npy",np.array(X))
#np.save("data/afdb/dl6/y.npy",np.array(y))
#np.save("data/afdb/dl6/annot_data.npy",np.array(annot))

In [34]:
#save dataset w/o filtering
np.save("data/afdb/dl6/X_raw.npy",np.array(X))
np.save("data/afdb/dl6/y.npy",np.array(y))
np.save("data/afdb/dl6/ids",np.array(ids))

In [29]:
print('before deleting first line')
print(X[0:4])
#X=np.delete(X,0,axis=0)
#print('efter deleting first line')
#print(X[0:4])




before deleting first line
[[ 0.17   0.19   0.16  ... -0.225 -0.205 -0.225]
 [ 0.16   0.14   0.095 ... -0.12  -0.125 -0.145]
 [ 0.18   0.16   0.1   ... -0.27  -0.25  -0.23 ]
 [ 0.29   0.325  0.33  ... -0.29  -0.25  -0.27 ]]


In [33]:
print(np.shape(X))
print(np.shape(y))
print(np.shape(annot))
print(np.shape(ids))

df = pd.DataFrame(data = y) 
values, counts = np.unique(y, return_counts=True)
df1 = pd.DataFrame({'types':values, 'val':counts})
    
df1
#'t':values, 'val':counts, 'patient_id':[patient_id]*len(counts)})
# df1 = pd.DataFrame({'types':values, 'val':counts, 'patient_id':[patient_id]*len(counts)})
#

(1116465, 1500)
(1116465, 1)
(1116465,)
(1116465,)


Unnamed: 0,types,val
0,0.0,608076
1,1.0,508389


https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html


In [54]:
to_append = pd.read_pickle(os.getcwd() + '/data/mitdb/data_features.pkl')  
to_append.groupby('id').label.describe(include='all')

Unnamed: 0_level_0,count,unique,top,freq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,115,1,O,115
2.0,81,1,O,81
3.0,104,1,N,104
4.0,123,1,N,123
5.0,75,1,N,75
6.0,93,1,N,93
7.0,126,1,O,126
8.0,91,1,N,91
9.0,115,2,N,114
10.0,100,2,O,75
