In [7]:
!pip install wfdb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [8]:
import numpy as np
import wfdb
import h5py
import pandas as pd
import os
import gc
import keras
from numpy.random import seed
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import label_binarize,  normalize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout, Flatten,Conv2D, BatchNormalization,MaxPooling2D, ReLU
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix,classification_report
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from tensorflow.python.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
seed(7)

In [9]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [10]:
# Parâmetros Gerais
N_samples = 6000 # number of samples in one 30 second epoch
N_channels = 6 # number of EEG channels used
num_classes = 3
FreqSample = 200
# lookback = 10
step = 2#
num_patient_per_block = 4
epochs = 5
batch_size = 128

In [11]:
def find(condition):
    res, = np.nonzero(np.ravel(condition))
    return res
# # -----------------------------------------------------------------------------
# # Retorna um array com os 6 channels EEG e a Frequencia de amostragem Fs
# # exem> file_name = '/content/drive/My Drive/data/training/tr03-0005/tr03-0005'
# # -----------------------------------------------------------------------------
def import_data(file_name): 

    this_data, fields = wfdb.io.rdsamp(file_name, channels=[0,1,2,3,4,5])
    # signal_names  = fields['sig_name']
    Fs = fields['fs']
    # n_signals = len(fields['sig_name'])
    n_samples = fields['sig_len']
    return this_data, Fs, n_samples #saida [n_samples,6]

# # -----------------------------------------------------------------------------
# # Importe o vetor de labels, dado o nome do arquivo.
# # e.g. /training/tr04-0808/tr04-0808-arousal.mat
# # -----------------------------------------------------------------------------
def import_arousals(file_name): 
  # Importa os aurosals do resgistro
  file_name = file_name + '-arousal.mat'
  f = h5py.File(file_name, 'r') 
  arousals = np.array(f['data']['arousals'])
  return arousals 

def get_files(): 
      
    header_loc, arousal_loc, signal_loc = [], [], []
    # rootDir = 'C:/Users/hp/Documents/Modelos_tesis/data/training'
    rootDir ='/content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data'
    for dirName, subdirList, fileList in os.walk(rootDir, followlinks=True):
    
        for fname in fileList:
            if '.hea' in fname:
                header_loc.append(dirName + '/' + fname)
            if '-arousal.mat' in fname:
                arousal_loc.append(dirName + '/' + fname)
            if 'mat' in fname and 'arousal' not in fname:
                signal_loc.append(dirName + '/' + fname)
    
    # combine into a data frame
    data_locations = {'header':      header_loc,
                      'arousal':     arousal_loc,
                      'signal':      signal_loc, 
    
                      }
    
    # Convert to a data-frame
    df = pd.DataFrame(data=data_locations, columns=['header', 'arousal','signal'])

    return df

def adjust_data(record_name):
    x = []
    eeg_raw,Fs,N = import_data(record_name) #[len_data,6]
    eeg_raw = np.expand_dims(eeg_raw, axis=0) # [1,len_data, 6]
    # eeg_raw = eeg_raw.reshape(1,1,eeg_raw.shape[0],6) #[1,1,len_data, 6]
    n_samples = 30*Fs
    for k in range(0, (N-n_samples+1), n_samples):
      # data = eeg_raw[:,:,k:k+n_samples,:] # 4 dim
      data = eeg_raw[:,k:k+n_samples,:]
      x.append(data)
    return x, Fs #saida:lista tamanho N/Fs*30 de segmentos de tamanho [1,Fs*30, 6]

def adjust_labels(record_name, Fs):
    y = []
    labels = import_arousals(record_name)
    n_samples = 30*Fs
    
    for k in range(0, (labels.shape[0]-n_samples+1), n_samples):
      clas = np.max(labels[k:k+n_samples])
      y.append(clas)
    # y = [0.0 if (x == -1.0) else x for x in y]
 
    return y # l es en formato np., Y en lista



In [12]:
def generator(df_files,min_index, max_index, batch_size=256, step=10): 
#   '''
#   generator which yields timeseries samples and their labels on the fly
#   input:
#       dir_path: the path of the directory which contains all edf files, where features[nsrrid].shape=(k,N,M), k (=1,2) is the number of channels
#                 N is the number of epochs for one patient, M is the number of features for one epoch
#       labels: the labels for each feature vectors, where labels[nsrrid].shape=(N,)
#       lookback: how many epochs back the input data should go
#       min_index (max_index): indices of the edf files which are used to generate samples
#       shuffle: whether to shuffle the samples or draw them in chronological order
#       batch_size: the number of samples per batch
#       step: the period, in timesteps, at which you sample feature array.
#       data_set:type of set 'train', 'test' or 'val' 
#   '''
  if max_index == None:
      max_index = len(df_files)
  start_subject_index = min_index 

  while 1:
    samples = []
    labels = []
    count = 0
    # #read the data from randomly selected num_patient_per_block subjects
    selected_index_nsrrid = np.arange(start_subject_index, min(start_subject_index+num_patient_per_block,max_index))
    for index in selected_index_nsrrid: 
      record_name = df_files.header.values[index][:-4]
      eeg_raw, Fs = adjust_data(record_name)
      z = len(eeg_raw)
      count+=z

      # if N == N_channels and M == 30*FreqSample:
      cl = adjust_labels(record_name, Fs)
      for i in range(z):
          samples.append(eeg_raw[i][::step,:])
          labels.append(cl[i])

    # print(count)
    num_sample = len(samples)
    indexes = np.arange(num_sample)
    np.random.shuffle(indexes)

    for i in range(0,num_sample,batch_size):
        if i+batch_size > num_sample:
            break

        batch_sample = [samples[p] for p in indexes][i:i+batch_size]
        batch_label = np.array([labels[o] for o in indexes][i:i+batch_size])
        # print((np.array(batch_sample)*0.001).shape, to_categorical(batch_label, num_classes=num_classes).shape)
        yield np.array(batch_sample)*0.001,  to_categorical(batch_label, num_classes=num_classes)
                
    start_subject_index += num_patient_per_block
    if start_subject_index >= max_index:
        start_subject_index = min_index
            

In [13]:
def contador(df_files):
  cont = 0
  # For each subject in the DataFrame_files set...
  for i in range(0, np.size(df_files, 0)):
      gc.collect()
      record_name = df_files.header.values[i][:-4]
      signal_names, Fs, n_samples = import_data(record_name)
      cont += (n_samples//(Fs*30))
  return cont

In [14]:
def generator_test(df_files,min_index, max_index, batch_size=256, step=10): 

  if max_index == None:
      max_index = len(df_files)
  start_subject_index = min_index 

  while 1:

    samples = []
    labels = []

    # #read the data from randomly selected num_patient_per_block subjects
    selected_index_nsrrid = np.arange(start_subject_index, min(start_subject_index+num_patient_per_block,max_index))
    for index in selected_index_nsrrid: 
      record_name = df_files.header.values[index][:-4]
      print('Salvando labels gerador...')
      print(index, record_name)
      eeg_raw, Fs = adjust_data(record_name)
      z = len(eeg_raw)
      cl = adjust_labels(record_name, Fs)
      for i in range(z):
        samples.append(eeg_raw[i][:,::step,:])
        labels.append(cl[i])

    
    num_sample = len(samples)
    indexes = np.arange(num_sample)

    for i in range(0,num_sample,batch_size):
        if i+batch_size > num_sample:
            break

        batch_sample = [samples[p] for p in indexes][i:i+batch_size]
        batch_label = np.array([labels[o] for o in indexes][i:i+batch_size])
        # print((np.array(batch_sample)*0.001).shape, to_categorical(batch_label, num_classes=num_classes).shape)
        yield np.array(batch_sample)*0.001,  to_categorical(batch_label, num_classes=num_classes)

    start_subject_index += num_patient_per_block
    if start_subject_index >= max_index:
        start_subject_index = min_index

In [15]:
def weight_class():
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights
    pos, neg, null, total = 0,0,0,0
    
    for i in range(0, np.size(tr_files, 0)):
    
        record_name = tr_files.header.values[i][:-4]
        arousals = list( adjust_labels(record_name, FreqSample))
        
        p = [n for n in arousals if n  == 1]
        n = [n for n in arousals if n  == 0]
        m = [n for n in arousals if n  == -1]
        
        total += len(arousals)
        pos += len(p)
        neg += len(n)
        null += len(m)
    
    w_0 = (1/neg)*(total)/3.0
    w_1 = (1/pos)*(total)/3.0
    w_m1 = (1/null)*(total)/3.0
    
    print('Weight for class 0: {:.2f}'.format(w_0))
    print('Weight for class 1: {:.2f}'.format(w_1))
    print('Weight for class -1: {:.2f}'.format(w_m1))
    class_weight = {0: w_0, 1: 16*w_1,  -1: w_m1}
    
    return class_weight 

def focal_loss(gamma=2., alpha=.25):
    from keras import backend as k
    import tensorflow as tf
# https://arxiv.org/pdf/1708.02002.pdf
# https://www.dlology.com/blog/multi-class-classification-with-focal-loss-for-imbalanced-datasets/
    gamma = float(gamma)
    alpha = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Focal loss for multi-classification
        FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t)
        Notice: y_pred is probability after softmax
        gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper
        d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x)
        Focal Loss for Dense Object Detection
        https://arxiv.org/abs/1708.02002

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Keyword Arguments:
            gamma {float} -- (default: {2.0})
            alpha {float} -- (default: {0.25})

        Returns:
            [tensor] -- loss.
        """
        epsilon = k.epsilon()
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)

        model_out = tf.add(y_pred, epsilon)
        ce = tf.multiply(y_true, -tf.math.log(model_out))
        weight = tf.multiply(y_true, tf.pow(tf.subtract(1., model_out), gamma))
        fl = tf.multiply(alpha, tf.multiply(weight, ce))
        reduced_fl = tf.reduce_max(fl, axis=1)
        return tf.reduce_mean(reduced_fl)
    
    return focal_loss_fixed

def categorical_focal_loss(alpha, gamma=2.):
    
    from keras import backend as K

    """
    Softmax version of focal loss.
    When there is a skew between different categories/labels in your data set, you can try to apply this function as a
    loss.
           m
      FL = ∑  -alpha * (1 - p_o,c)^gamma * y_o,c * log(p_o,c)
          c=1
      where m = number of classes, c = class and o = observation
    Parameters:
      alpha -- the same as weighing factor in balanced cross entropy. Alpha is used to specify the weight of different
      categories/labels, the size of the array needs to be consistent with the number of classes.
      gamma -- focusing parameter for modulating factor (1-p)
    Default value:
      gamma -- 2.0 as mentioned in the paper
      alpha -- 0.25 as mentioned in the paper
    References:
        Official paper: https://arxiv.org/pdf/1708.02002.pdf
        https://www.tensorflow.org/api_docs/python/tf/keras/backend/categorical_crossentropy
    Usage:
     model.compile(loss=[categorical_focal_loss(alpha=[[.25, .25, .25]], gamma=2)], metrics=["accuracy"], optimizer=adam)
    """

    alpha = np.array(alpha, dtype=np.float32)

    def categorical_focal_loss_fixed(y_true, y_pred):
        """
        :param y_true: A tensor of the same shape as `y_pred`
        :param y_pred: A tensor resulting from a softmax
        :return: Output tensor.
        """

        # Scale predictions so that the class probas of each sample sum to 1
        y_pred /= K.sum(y_pred, axis=-1, keepdims=True)

        # Clip the prediction value to prevent NaN's and Inf's
        epsilon = K.epsilon()
        y_pred = K.clip(y_pred, epsilon, 1. - epsilon)

        # Calculate Cross Entropy
        cross_entropy = -y_true * K.log(y_pred)

        # Calculate Focal Loss
        loss = alpha * K.pow(1 - y_pred, gamma) * cross_entropy

        # Compute mean loss in mini_batch
        return K.mean(K.sum(loss, axis=-1))

    return categorical_focal_loss_fixed


In [16]:
def plot_confusion_matrix(data, labels, output_filename):
    import seaborn
    import pandas.util.testing as tm
    """Plot confusion matrix using heatmap.
 
    Args:
        data (list of list): List of lists with confusion matrix data.
        labels (list): Labels which will be plotted across x and y axis.
        output_filename (str): Path to output file.
 
    """
    seaborn.set(color_codes=True)
    plt.figure(1, figsize=(9, 6))
 
    plt.title("Confusion Matrix")
 
    seaborn.set(font_scale=1)
    ax = seaborn.heatmap(data, annot=True,fmt="d", cmap="YlGnBu", cbar_kws={'label': 'Scale'})
 
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
 
    ax.set(ylabel="True Label", xlabel="Predicted Label")
    plt.savefig(output_filename, bbox_inches='tight', dpi=300)
    plt.show()
    plt.close()
 

# **Modelo** 
---
---





In [21]:
def model_cnn1(pretrained_weights=' ',uni_cama=[32,64,128,256], kernels=[3,3,3,3],de=128):
  
  conv = Sequential(name='cnn')
  conv.add(Conv2D(uni_cama[0], (1,kernels[0]), activation = 'relu',padding = 'same', input_shape = (1, 30*FreqSample//step, 6)))
  conv.add(MaxPooling2D(pool_size=(1,2)))
  conv.add(Conv2D(uni_cama[1], (1,kernels[1]), activation = 'relu'))
  conv.add(MaxPooling2D(pool_size=(1,2)))
  conv.add(Conv2D(uni_cama[2], (1,kernels[2]), activation = 'relu'))
  conv.add(MaxPooling2D((1,2)))
  conv.add(Conv2D(uni_cama[3], (1,kernels[3]), activation = 'relu'))
  conv.add(MaxPooling2D(1,2))
  conv.add(Flatten())
  conv.add(Dense(de, activation = 'linear'))
  conv.add(BatchNormalization())
  conv.add(ReLU())
  conv.add(Dense(num_classes, activation = 'sigmoid'))
  conv.summary()

  conv.load_weights(pretrained_weights)

  return conv




In [18]:
def mis_metricas(predict, labels, pw):
  
  from sklearn.metrics import confusion_matrix,classification_report

  y_test_non_category = [np.argmax(t) for t in labels]
  y_predict_non_category = [np.argmax(t) for t in predict]
  precision, recall, thresholds = precision_recall_curve(y_test_non_category,
                                                          y_predict_non_category, pos_label=1)
  auprc = auc(recall, precision)
  auroc = roc_auc_score(labels, predict,multi_class='ovr')
  print()
  print('auroc %f e auprc %f'% (auroc, auprc))
  print()

  if num_classes == 2:
    nomlabels = ['0','1']
  else:
    nomlabels = ['0','1', '-1']

  cm = confusion_matrix(y_test_non_category, y_predict_non_category)

  print(cm)
  print()

  report = classification_report(y_test_non_category,
                                y_predict_non_category,target_names=nomlabels)
  print(report)

  # create confusion matrix
  plot_confusion_matrix(cm, nomlabels, pw.replace(".hdf5","_confu_matriz.png"))


# Evaluando los modelos

In [19]:
files = get_files() 
files

Unnamed: 0,header,arousal,signal
0,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...
1,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...
2,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...
3,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...
4,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...
5,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...
6,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...,/content/drive/MyDrive/Colab Notebooks/ENTREGA...


In [None]:
pw = '/content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/Pesos/cnn1_t1.hdf5'
model = model_cnn1(pretrained_weights = pw)
steps_te = contador(files)//batch_size 

np.random.seed(seed=0)
test_data = generator_test(files, min_index=0, max_index=None, batch_size=batch_size, step=step)
predict = model.predict(test_data, steps_te, use_multiprocessing=False)

np.random.seed(seed=0)
test_data = generator_test(files, min_index=0, max_index=None, batch_size=batch_size, step=step)
labels = []   # store all the generated label batches
max_iter = steps_te  # maximum number of iterations, in each iteration one batch is generated; the proper value depends on batch size and size of whole data

h = 0
for d, l in test_data:
    for j in range(batch_size):
      labels.append(l[j])
    h += 1
    if h == max_iter:
        break

y_true = np.array(labels) # categorical
print(y_true.shape, predict.shape)

mis_metricas(predict, y_true,pw)

np.random.seed(seed=0)
test_data = generator_test(files, min_index=0, max_index=None, batch_size=batch_size, step=step, red='cnn')
scores_test = model.evaluate(test_data,steps=steps_te,verbose=1)
print('AUROC: ', scores_test[2])
print('AUPRC: ' , scores_test[3])



[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Salvando labels gerador...
6 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0005/tr03-0005
Salvando labels gerador...
0 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0257/tr03-0257
Salvando labels gerador...
1 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0314/tr03-0314
Salvando labels gerador...
2 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0401/tr03-0401
Salvando labels gerador...
3 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0086/tr03-0086
Salvando labels gerador...
4 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0187/tr03-0187
Salvando labels gerador...
5 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0141/tr03-0141
Salvando labels gerador...
6 /content/drive/MyDrive/Colab Notebooks/ENTREGA INF2102 PFP/data/tr03-0005/tr03-0005
Salvando labels gerador