In [1]:
# Notebook bootstrap: enable autoreload, put the project source directory on
# sys.path, and silence warnings.
# (disabled) from __future__ import absolute_import, division, print_function
%load_ext autoreload
%autoreload 2

import sys, os
# All paths below are resolved relative to the kernel's current working directory.
script_dir  = os.path.normpath(os.path.abspath("."))
# Four levels above the working directory; not referenced again in the visible cells.
root_dir    = os.path.normpath(os.path.abspath(script_dir + "/../../../.."))
# Two levels above the working directory; inserted into sys.path below,
# presumably so that the `common` module can be imported — verify.
source_dir  = os.path.normpath(os.path.abspath(script_dir + "/../.."))
# Remove an existing entry (first occurrence) before re-inserting at index 1,
# so re-running this cell does not keep adding the same path.
if source_dir in sys.path: sys.path.remove(source_dir)
sys.path.insert(1, source_dir)

import warnings
# NOTE(review): this suppresses every warning category for the whole kernel,
# including deprecation and numerical warnings raised during training.
warnings.filterwarnings("ignore")

In [2]:
from common import *

In [3]:
# DataPath is not defined in the visible cells; presumably it is provided by
# `from common import *` — verify. Only its `output_path` attribute is read
# in the visible cells.
dp = DataPath()

In [4]:
# load_path: directory holding the preprocessed inputs read by the next cells.
# train_path: sub-directory of load_path that receives this experiment's
# checkpoints, logs and per-fold results.
load_path  = f'{dp.output_path}/seq_norm_8_nots'
train_path = f'{load_path}/train_VAE_Kfold'
# exist_ok=True creates the tree only when missing, without a separate
# exists() check that could race with another process creating it.
os.makedirs(train_path, exist_ok=True)

In [5]:
# Materialise every array of the .npz archive into a plain dict.
# np.load returns an NpzFile that keeps the archive open; reading it inside a
# context manager closes the file handle once the arrays are in memory.
with np.load(f'{load_path}/data_info.npz') as npz_file:
    data_info = dict(npz_file)

In [6]:
# Load the remaining preprocessing artefacts stored under load_path.
# `joblib` and `pd` are not imported in the visible cells; presumably they
# come from `from common import *` — verify.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code; only load files from a trusted source.
# None of the three objects below is referenced again in the visible cells.
load_info = joblib.load(f'{load_path}/save_info.joblib')
df_data   = pd.read_hdf(f'{load_path}/df_data.hdf')
scaler = joblib.load(f'{load_path}/scaler.joblib')

In [7]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os
import random
import torch
import sys
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from tqdm import tqdm
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import classification_report, confusion_matrix

# Param

In [8]:
# Experiment-wide settings.
seed       = 42          # random seed shared by every library and splitter
num_folds  = 5
scoring    = "roc_auc"
batch_size = 1028

def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy, PyTorch (CPU and, when CUDA is available,
    all GPUs) and TensorFlow, and exports PYTHONHASHSEED.

    Args:
        seed_value: Integer seed applied to all of the libraries above.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        # Trade cuDNN autotuning for repeatable kernel selection.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Use the argument rather than the module-level `seed`, so TensorFlow is
    # seeded with the same value as every other library.
    tf.random.set_seed(seed_value)

# Seed all libraries once, before any of the cells below split data or build models.
seed_everything(seed)

# Load Data

In [9]:
# Feature tensor and integer class labels taken from the loaded archive.
x_train = data_info["x"]
y_train = data_info["seq_y"]

# One-hot encode the labels with a single vectorised assignment instead of a
# Python-level loop over every sample: row i gets a 1.0 in column y_train[i].
n_classes = 2
y_onehot = np.zeros((len(y_train), n_classes), dtype=np.float32)
y_onehot[np.arange(len(y_train)), y_train] = 1.0

print(f'x: {x_train.shape}')
print(f'y: {y_train.shape}')
print(f'y_onehot: {y_onehot.shape}')

x: (298701, 8, 25)
y: (298701,)
y_onehot: (298701, 2)


# Metrics

In [10]:
#Loss
from tensorflow.keras import backend as K

# Additive term applied to both numerator and denominator of the Dice ratios.
smooth = 1.0
# Threshold above which a target entry counts as positive in dice_coef_multi.
epsilon = 1e-7

def dice_coef(y_true, y_pred):
    """Smoothed Dice coefficient over all elements of both tensors.

    Both inputs are flattened, and the result is
    (2 * sum(y_true * y_pred) + smooth) / (sum(y_true) + sum(y_pred) + smooth),
    where `smooth` is the module-level constant.
    """
    truth = K.flatten(y_true)
    pred = K.flatten(y_pred)
    overlap = K.sum(truth * pred)
    numerator = 2. * overlap + smooth
    denominator = K.sum(truth) + K.sum(pred) + smooth
    return numerator / denominator
# dice_coef

def dice_coef_loss(y_true, y_pred):
    """Dice loss: one minus the value returned by dice_coef for the same inputs."""
    score = dice_coef(y_true, y_pred)
    return 1.0 - score

# dice_coef_loss
def dice_coef_multi(y_true, y_pred):
    """Smoothed Dice coefficient that skips index 0 of the last axis.

    Both inputs are sliced with [..., 1:] and flattened. The target side of
    the denominator is the count of target entries greater than `epsilon`
    (not their sum); the prediction side is the sum of the predictions.
    """
    truth = K.flatten(y_true[..., 1:])
    pred = K.flatten(y_pred[..., 1:])

    overlap = K.sum(truth * pred)
    positive_count = K.sum(K.cast(truth > epsilon, dtype="float32"))
    pred_total = K.sum(pred)

    return (2. * overlap + smooth) / (positive_count + pred_total + smooth)
# dice_coef_multi

def dice_coef_multi_loss(y_true, y_pred):
    """Loss form of dice_coef_multi: one minus its value for the same inputs."""
    score = dice_coef_multi(y_true, y_pred)
    return 1.0 - score
# dice_coef_multi_loss

def mean_acc(y_true, y_pred):
    """Mean of the row-normalised confusion-matrix diagonal for one batch.

    Labels are the argmax over axis 1 of each input. Every confusion-matrix
    row is divided by its row total; a row whose total is zero (no true
    samples of that class in the batch) becomes all zeros, so that class
    contributes 0 to the mean rather than NaN.

    Returns:
        A float64 scalar tensor.
    """
    true_labels = K.argmax(y_true, axis = 1)
    pred_labels = K.argmax(y_pred, axis = 1)
    counts = tf.cast(tf.math.confusion_matrix(true_labels, pred_labels), tf.float64)
    row_totals = tf.reduce_sum(counts, axis = 1, keepdims = True)
    # divide_no_nan yields 0 wherever the row total is 0, which matches
    # replacing the 0/0 NaN entries with zeros.
    per_class = tf.math.divide_no_nan(counts, row_totals)
    return tf.reduce_mean(tf.linalg.diag_part(per_class))

In [11]:
# Compile-time configuration; build_VAE reads `metrics`, `loss_fn` and
# `optimizer_fn` as module-level globals when it compiles a model.
metrics = ["acc", dice_coef_multi, mean_acc, tf.keras.metrics.AUC()]
# NOTE(review): two losses are listed, but the model compiled in build_VAE has
# a single output. Confirm how the installed Keras version treats the second
# entry (summed, ignored or rejected) and whether dice_coef_multi_loss really
# takes part in training.
loss_fn = ["categorical_crossentropy", dice_coef_multi_loss] # "categorical_crossentropy",
optimizer_fn = tf.keras.optimizers.Adam(learning_rate=0.0001)
# Not referenced in the visible cells.
weights = None

# Model

In [12]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [13]:
#Create a sampling layer

class Sampling(layers.Layer):
    """Draws z = z_mean + exp(0.5 * z_log_var) * noise from (z_mean, z_log_var).

    `noise` comes from tf.keras.backend.random_normal with shape (batch, dim),
    both taken from the first two dimensions of z_mean.
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        mean_shape = tf.shape(z_mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * z_log_var)
        return z_mean + std_dev * noise

In [14]:
#TVAE
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, roc_curve

# Reset Keras' global state before the model for this cell is built.
tf.keras.backend.clear_session()


def build_VAE(input_shape, latent_dim):
    """Build and compile the LSTM classifier with a sampled latent layer.

    Architecture: three stacked LSTM layers (100, 50, 25 units) -> two Dense
    heads producing z_mean and z_log_var -> Sampling -> a Dense/Dropout stack
    (8, 64, 32, 16 units) -> Dense(2, sigmoid).

    The loss list, the metric list and the optimizer configuration are read
    from the module-level `loss_fn`, `metrics` and `optimizer_fn`.

    Args:
        input_shape: Shape of one input sample, without the batch axis.
        latent_dim: Width of the z_mean / z_log_var / z layers.

    Returns:
        The compiled tf.keras.Model.

    NOTE(review): no KL-divergence term is added in this function, so z_mean
    and z_log_var are trained only through the classification loss — confirm
    this is intended for a model named "VAE".
    NOTE(review): the output layer applies sigmoid to 2 units while the
    configured loss list starts with categorical cross-entropy — confirm
    softmax was not intended.
    """
    # Encoder
    encoder_inputs = keras.Input(shape=input_shape)
    x = layers.LSTM(100, activation='tanh', return_sequences=True)(encoder_inputs)
    x = layers.LSTM(50, activation='tanh', return_sequences=True)(x)
    x = layers.LSTM(25, activation='tanh')(x)

    z_mean = layers.Dense(latent_dim, name="z_mean")(x)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])

    # Classification head on top of the sampled latent vector
    decoder1 = layers.Dense(8, activation='relu')(z)
    decoder1 = layers.Dropout(0.2)(decoder1)

    decoder1 = layers.Dense(64, activation='relu')(decoder1)
    decoder1 = layers.Dropout(0.2)(decoder1)

    decoder1 = layers.Dense(32, activation='relu')(decoder1)
    decoder1 = layers.Dropout(0.2)(decoder1)

    decoder1 = layers.Dense(16, activation='relu')(decoder1)
    decoder1 = layers.Dropout(0.1)(decoder1)

    decoder_out = layers.Dense(2, activation='sigmoid')(decoder1)
    model = tf.keras.Model(inputs=encoder_inputs, outputs=decoder_out)

    # Compile with a fresh optimizer that copies the configuration of the
    # module-level `optimizer_fn`. Passing the shared instance to every model
    # would carry optimizer state (e.g. its step counter) from one build to
    # the next, so later cross-validation folds would not start from the same
    # optimizer state as the first one.
    optimizer = type(optimizer_fn).from_config(optimizer_fn.get_config())

    model.compile(
            optimizer = optimizer,
            loss      = loss_fn,
            metrics   = metrics,
            run_eagerly = True,
        )

    return model
    #model.compile(
    #optimizer='adam',
    #loss='categorical_crossentropy',
    #metrics='accuracy',
    #
    
# Build one model and print its layer summary. The visible training loop
# builds its own model per fold and does not use this instance.
VAE_clf = build_VAE(input_shape = x_train.shape[1:], latent_dim = 8)
VAE_clf.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 8, 25)]      0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 8, 100)       50400       ['input_1[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 8, 50)        30200       ['lstm[0][0]']                   
                                                                                                  
 lstm_2 (LSTM)                  (None, 25)           7600        ['lstm_1[0][0]']                 
                                                                                              

In [15]:
# Display the feature tensor's shape as the cell output.
x_train.shape

(298701, 8, 25)

In [17]:
tf.keras.backend.clear_session()

#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, roc_curve, average_precision_score

kf = KFold(n_splits=5, random_state=seed, shuffle=True)
stra_kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

result_info = {}
count = 0
#date  = time().time()
for train_index, test_index in kf.split(x_train, y_train):
    #tf.keras.backend.clear_session()
    x_train1, x_test1 = x_train[train_index], x_train[test_index]
    y_train1, y_test1 = y_train[train_index], y_train[test_index]

    y_train_onehot1 = y_onehot[train_index]
    y_test_onehot1 = y_onehot[test_index]

    rs_path = f'{train_path}/rs'
    if os.path.exists(rs_path) == False: os.makedirs(rs_path)
    
    #logs_path = f'{train_path}/logs_ECG/Fold_{count}'
    logs_path = f'{train_path}/logs_VAE_Fold_{count}'
    if os.path.exists(logs_path) == False: os.makedirs(logs_path)
    print(count)
    
    best_model_path = f'{train_path}/model_VAE_KFold_{count}.hdf5'
    log_model_path  = f'{train_path}/model_VAE_KFold_{count}.logs.csv'
    print(best_model_path)
    
    if os.path.exists(best_model_path):
        print(f'Remove {best_model_path}')
        os.remove(best_model_path)
        !ls "$logs_path"
    
    if os.path.exists(log_model_path):
        print(f'Remove {log_model_path}')
        os.remove(log_model_path)
        !ls "$logs_path"
    

    cbs = []
    cbs.append(tf.keras.callbacks.ModelCheckpoint(
        filepath=best_model_path,
        save_weights_only=True,
        monitor='loss',
        mode='min',
        #monitor='val_acc',
        #mode='max',
    # monitor='val_mean_acc',
    # mode='max',
        verbose=1,
        save_best_only=True))

    cbs.append(tf.keras.callbacks.TensorBoard(log_dir=f'{logs_path}'))
    cbs.append(tf.keras.callbacks.EarlyStopping(monitor = 'loss', mode='min', patience=10))
    cbs.append(tf.keras.callbacks.CSVLogger(filename=log_model_path, separator=",", append=False))
    
    model = build_VAE(input_shape = x_train.shape[1:], latent_dim = 8)

    model.fit(
            x_train1,
            y_train_onehot1,
            epochs=1000,
            batch_size=1024*2,
            #validation_split = 0.2,
            #validation_data=(x_test1, y_test_onehot1),
            shuffle=True,
            #verbose=2,
            callbacks=cbs,
            #class_weight = class_weight
        )
    
    
    model.load_weights(best_model_path)
                   
    # predict test-fold
    y1_test_pred_score = model.predict(x_test1)
    y1_test_pred_label = np.argmax(y1_test_pred_score, axis = 1)

    y1_test_cm_norm = confusion_matrix(y_test1, y1_test_pred_label, normalize='true')
    y1_test_cm      = confusion_matrix(y_test1, y1_test_pred_label)

    fpr, tpr, thr = roc_curve(y_test1, y1_test_pred_label, pos_label=1)
    y1_auc = "%.5f" % auc(fpr, tpr)
    y1_auc_prob = roc_auc_score(y_test1, y1_test_pred_score[:,1])

    precision, recall, _ = precision_recall_curve(y_test1, y1_test_pred_label)
    y1_prc = "%.5f" % auc(precision, recall)
    avg_pre = average_precision_score(y_test1, y1_test_pred_label, pos_label=1)

    

    print(f'TestFold {count} Mean-Acc: {np.mean(y1_test_cm_norm.diagonal())}\n{y1_test_cm}\n')
    print(f'TestFold {count} Mean-Auc: {y1_auc}')
    print(f'TestFold {count} AUC proba: {y1_auc_prob}')
    print(f'TestFold {count} Mean-PRC: {y1_prc}')
    print(f'TestFold {count} AVG-PRC: {avg_pre}')
    
    res_VAE_VS = {
    "true" : y_test1,
    "predict" : y1_test_pred_label,
    "score_0" : y1_test_pred_score[:,0],
    "score_1" : y1_test_pred_score[:,1],
    "AUC" : y1_auc,
    "AUC_prob" : y1_auc_prob,
    "PRC" : y1_prc,
    "avg_PRC" : avg_pre
    }

    res_VAE_VS = pd.DataFrame(res_VAE_VS)
    
    res_VAE_VS.to_csv(f'{rs_path}/res_VAE_Fold_{count}.csv')
    
    #if os.path.exists(rs_path) == True:
     #   print(f'Remove {rs_path}')
      #  os.remove(rs_path)
       # !ls "$rs_path"
    
    tf.keras.backend.clear_session()
    
    count +=1
    #break
    pass


0
/media/nghia/Nguyen NghiaW/RRS-2021/20210322_RRS/RRS/3yrs_refined_data/seq_norm_8_nots/train_VAE_Kfold/model_VAE_KFold_0.hdf5
Remove /media/nghia/Nguyen NghiaW/RRS-2021/20210322_RRS/RRS/3yrs_refined_data/seq_norm_8_nots/train_VAE_Kfold/model_VAE_KFold_0.hdf5
train
Remove /media/nghia/Nguyen NghiaW/RRS-2021/20210322_RRS/RRS/3yrs_refined_data/seq_norm_8_nots/train_VAE_Kfold/model_VAE_KFold_0.logs.csv
train
Epoch 1/1000
Epoch 00001: loss improved from inf to 0.11612, saving model to /media/nghia/Nguyen NghiaW/RRS-2021/20210322_RRS/RRS/3yrs_refined_data/seq_norm_8_nots/train_VAE_Kfold/model_VAE_KFold_0.hdf5
Epoch 2/1000
Epoch 00002: loss improved from 0.11612 to 0.02099, saving model to /media/nghia/Nguyen NghiaW/RRS-2021/20210322_RRS/RRS/3yrs_refined_data/seq_norm_8_nots/train_VAE_Kfold/model_VAE_KFold_0.hdf5
Epoch 3/1000
Epoch 00003: loss improved from 0.02099 to 0.01679, saving model to /media/nghia/Nguyen NghiaW/RRS-2021/20210322_RRS/RRS/3yrs_refined_data/seq_norm_8_nots/train_VAE_Kf

In [18]:
# Print the class counts of the train and test split of every fold, using the
# same KFold settings (5 splits, shuffled, seeded) as the splitter above.
tf.keras.backend.clear_session()

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, roc_curve

kf = KFold(n_splits=5, shuffle=True, random_state=seed)
stra_kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

result_info = {}
count = 0
for train_index, test_index in kf.split(x_train, y_train):
    x_train1 = x_train[train_index]
    x_test1 = x_train[test_index]
    y_train1 = y_train[train_index]
    y_test1 = y_train[test_index]
    for split_name, split_labels in (('train:', y_train1), ('test:', y_test1)):
        print(split_name, np.unique(split_labels, return_counts = True))

train: (array([0, 1]), array([238161,    799]))
test: (array([0, 1]), array([59533,   208]))
train: (array([0, 1]), array([238147,    814]))
test: (array([0, 1]), array([59547,   193]))
train: (array([0, 1]), array([238143,    818]))
test: (array([0, 1]), array([59551,   189]))
train: (array([0, 1]), array([238164,    797]))
test: (array([0, 1]), array([59530,   210]))
train: (array([0, 1]), array([238161,    800]))
test: (array([0, 1]), array([59533,   207]))
