In [None]:
from EHRDataloader import EHRdataFromPickles, EHRdataloader

## 1. Load Dataset

This part of the code loads the dataset using `EHRDataloader.py`.
The original code can be found at: https://github.com/ZhiGroup/pytorch_ehr


In [None]:
# Only one pickle file is supplied, so the dataset object performs the split itself.
print('1 file found. Data will be split into train, validation and test.')
data = EHRdataFromPickles(
    root_dir='../data/',
    file='toy.train',
    sort=False,  # no sorting before the split
    test_ratio=0.2,
    valid_ratio=0.1,
    model='RNN',
)

# Split the loaded data; the result is unpacked in the order train, test, valid.
train, test, valid = data.__splitdata__()



In [None]:
## Collect the label of every training patient (1: heart failure, 0: no heart failure).
## Element 1 of each patient record holds the label.
labels = [train[idx][1] for idx in range(len(train))]

In [None]:
## Distribution of the labels
from collections import Counter
# Count how many training patients carry each label value; as the last
# expression of the cell, the resulting Counter is shown as the cell output.
Counter(labels)

## 2. Sample of dataset

In [None]:
## Show one example record: patient 0, visit 0.
## A record is indexed as [0] patient id, [1] label, [2] list of visits;
## a visit is indexed as [0] visit time, [1] codes.
patient = 0
visit = 0
record = train[patient]
visits = record[2]

print("Patient ID:", record[0])
print("Heart Failure:", record[1])
print("# of visits:", len(visits))

print(f' list of visit_time (since last time): {visits[visit][0]}')
print(f' list of codes corresponding to visit: {visits[visit][1]}')




# 3. Preprocess Data for Training

This part of the code transforms the data, which has the format described above, into lists of minibatches for the train, validation and test splits using `EHRdataloader`.

In [None]:
from tqdm import tqdm

In [None]:
# Minibatch size handed to EHRdataloader in the next cell.
batch_size=32
# Forwarded to EHRdataloader as packPadMode.
# NOTE(review): presumably enables packed/padded sequences — confirm in EHRDataloader.py.
pack_pad = True

In [None]:
## Understand EHRdataloader: materialise each split as a list of minibatches.
def _as_minibatches(split):
    """Run EHRdataloader over `split` and collect every minibatch (tqdm shows progress)."""
    loader = EHRdataloader(split, batch_size = batch_size, packPadMode = pack_pad)
    return list(tqdm(loader))


train_mbs = _as_minibatches(train)
print (' creating the list of valid minibatches')
valid_mbs = _as_minibatches(valid)
print (' creating the list of test minibatches')
test_mbs = _as_minibatches(test)

In [None]:
# NOTE(review): the model code in this notebook uses TensorFlow 1.x-style APIs
# (tf.Session, tf.placeholder, tf.contrib). An unpinned install that resolves to
# TensorFlow 2.x will presumably not run it — confirm and pin a compatible version.
!pip install tensorflow

# Data.py

In [None]:
import pandas as pd
import numpy as np


class DataSet(object):
    """Container for three aligned arrays that serves shuffled minibatches.

    The three arrays (dynamic features, labels, last features) are indexed
    along their first axis by example; they are always shuffled with the same
    permutation so that rows stay aligned.
    """

    def __init__(self, dynamic_features, labels, last_features):
        self._dynamic_features = dynamic_features
        self._labels = labels
        self._last_features = last_features
        # The number of examples is taken from the first axis of the labels.
        self._num_examples = labels.shape[0]
        self._epoch_completed = 0
        self._batch_completed = 0
        self._index_in_epoch = 0

    # ------------------------------------------------------------------ #
    # batching
    # ------------------------------------------------------------------ #
    def _take(self, begin, stop):
        """Return the rows ``begin:stop`` of all three arrays as a tuple."""
        return (self._dynamic_features[begin:stop],
                self._labels[begin:stop],
                self._last_features[begin:stop])

    def next_batch(self, batch_size):
        """Return the next ``(dynamic_features, labels, last_features)`` batch.

        A non-positive or too-large ``batch_size`` falls back to the full data
        set. The data is shuffled before the very first batch and again each
        time an epoch ends. The last batch of an epoch holds whatever rows
        remain and may therefore be smaller than ``batch_size``.
        """
        if batch_size <= 0 or batch_size > self.num_examples:
            batch_size = self._labels.shape[0]

        if self._batch_completed == 0:
            self._shuffle()
        self._batch_completed += 1

        begin = self._index_in_epoch
        stop = begin + batch_size

        if stop < self.num_examples:
            # Ordinary batch in the middle of an epoch.
            self._index_in_epoch = stop
            return self._take(begin, stop)

        # This batch exhausts the epoch: hand out the remaining rows, then
        # reshuffle and rewind for the next epoch.
        self._epoch_completed += 1
        remainder = self._take(begin, self._num_examples)
        self._shuffle()
        self._index_in_epoch = 0
        return remainder

    def _shuffle(self):
        """Apply one random permutation to all three arrays."""
        order = np.arange(self._num_examples)
        np.random.shuffle(order)
        self._dynamic_features, self._labels, self._last_features = (
            self._dynamic_features[order],
            self._labels[order],
            self._last_features[order],
        )

    # ------------------------------------------------------------------ #
    # read-only views of the data
    # ------------------------------------------------------------------ #
    @property
    def dynamic_features(self):
        return self._dynamic_features

    @property
    def labels(self):
        return self._labels

    @property
    def last_features(self):
        return self._last_features

    @property
    def num_examples(self):
        return self._num_examples

    # ------------------------------------------------------------------ #
    # progress counters
    # ------------------------------------------------------------------ #
    @property
    def batch_completed(self):
        return self._batch_completed

    @property
    def epoch_completed(self):
        return self._epoch_completed

    @epoch_completed.setter
    def epoch_completed(self, value):
        self._epoch_completed = value


def read_data(max_times_of_visits=20):
    """Load the preprocessed CSV files and build a ``DataSet``.

    Reads ``resources/preprocessed_label.csv`` and
    ``resources/preprocessed_feature.csv`` (both GBK-encoded). Rows are grouped
    by the ``patient_id`` column. For every patient, at most
    ``max_times_of_visits`` leading visits are used as the input sequence, the
    visit right after that sequence supplies the "last" features, and the label
    row of the final visit inside the sequence is kept.

    :param max_times_of_visits: maximum number of visits kept per patient;
        shorter sequences are zero-padded at the end to this length.
    :return: ``DataSet(features_on_visit, binary_labels, features_of_last)``
        where ``features_on_visit`` is stacked to
        (patients, max_times_of_visits, features), ``binary_labels`` is
        (patients, 1) and ``features_of_last`` is (patients, features).
    """
    dataset_label = pd.read_csv("resources/preprocessed_label.csv", encoding='gbk')
    # Unique patient ids (taken from the first column) in order of first appearance.
    patient_id_list = dataset_label.iloc[:, 0].drop_duplicates()

    dataset_feature = pd.read_csv("resources/preprocessed_feature.csv", encoding='gbk')
    labels_on_visits = []
    features_on_visit = []
    features_of_last = []
    for patient_id in patient_id_list:
        # `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` is its replacement.
        # The first 2 label columns and the first 3 feature columns are skipped as non-data columns.
        one_visit_labels = dataset_label.loc[dataset_label['patient_id'] == patient_id].iloc[:, 2:].to_numpy()
        one_visit_feature = dataset_feature.loc[dataset_feature['patient_id'] == patient_id].iloc[:, 3:].to_numpy()

        num_visits = one_visit_labels.shape[0]
        if max_times_of_visits < num_visits - 1:
            # More visits than needed: truncate to the first `max_times_of_visits`
            # visits and use the following visit as the "last" one.
            labels_on_visits.append(one_visit_labels[max_times_of_visits - 1])
            features_on_visit.append(one_visit_feature[0:max_times_of_visits])
            features_of_last.append(one_visit_feature[max_times_of_visits])
        else:
            # Use every visit but the final one as the sequence; the final visit is the "last" one.
            # NOTE(review): for a patient with a single row, index `num_visits - 2` is -1 and
            # therefore selects that same single row, with an empty input sequence — confirm
            # such patients cannot occur in the data.
            labels_on_visits.append(one_visit_labels[num_visits - 2])
            features_on_visit.append(one_visit_feature[0:num_visits - 1])
            features_of_last.append(one_visit_feature[one_visit_feature.shape[0] - 1])

    all_labels_on_visits = np.array(labels_on_visits)

    # Binary target: 1 if any of the label columns 5..9 is positive, else 0
    # (sign of their sum). NOTE(review): the meaning of these columns is not
    # visible here — confirm against the label file.
    binary_labels_of_revisits = np.expand_dims(np.sign(np.sum(all_labels_on_visits[:, 5:10], 1)), 1)

    # Zero-pad every sequence at the end so all patients share the same number of time steps.
    features_on_visit = [
        np.pad(seq, ((0, max_times_of_visits - seq.shape[0]), (0, 0)), 'constant', constant_values=0)
        for seq in features_on_visit
    ]
    features_on_visit = np.stack(features_on_visit)
    features_of_last = np.stack(features_of_last)
    return DataSet(features_on_visit, binary_labels_of_revisits, features_of_last)


if __name__ == "__main__":
    # Smoke test: build the data set with at most 7 visits per patient and
    # report success. The result is only bound to `x`; nothing else is checked.
    x = read_data(7)
    print("ok")

# Model.py

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_auc_score
import time


class LstmGanModel(object):
    """LSTM classifier trained jointly with a GAN-style discriminator.

    Built with TensorFlow 1.x graph-mode APIs (``tf.Session``,
    ``tf.placeholder``, ``tf.contrib``).

    The "generator" runs an LSTM over the visit sequence. From its final
    hidden state it produces (a) a classification logit and (b) a prediction
    of the features of the next ("last") visit. The "discriminator" is a single
    dense layer that tries to tell real last-visit features from predicted
    ones. The generator is trained on classification loss plus adversarial
    loss.
    """

    # Optimizer names accepted as strings, mapped to the tf.train class to instantiate.
    _OPTIMIZER_CLASSES = {
        'adam': 'AdamOptimizer',
        'sgd': 'GradientDescentOptimizer',
        'adagrad': 'AdagradOptimizer',
        'rmsprop': 'RMSPropOptimizer',
    }

    def __init__(self, num_features, time_steps,
                 lstm_size=200, n_output=1, batch_size=64,
                 epochs=1000,
                 output_n_epoch=10,
                 learning_rate=0.01, max_loss=0.5, max_pace=0.01, lasso=0.0, ridge=0.0,
                 optimizer='adam', name='LSTM-GAN'):
        """Store the hyper-parameters and build the TensorFlow graph.

        :param num_features: number of features per visit.
        :param time_steps: (padded) number of visits per sequence.
        :param lstm_size: number of LSTM units.
        :param n_output: number of classification outputs.
        :param batch_size: minibatch size used by ``fit``.
        :param epochs: number of training epochs run by ``fit``.
        :param output_n_epoch: log progress every this many epochs.
        :param learning_rate: learning rate used when ``optimizer`` is a name.
        :param max_loss: stored and printed; not used by the graph built here.
        :param max_pace: stored and printed; not used by the graph built here.
        :param lasso: L1 regularization weight (0 disables it).
        :param ridge: L2 regularization weight (0 disables it).
        :param optimizer: either an optimizer object exposing ``minimize`` or
            one of the names in ``_OPTIMIZER_CLASSES`` (case-insensitive).
        :param name: model name.
        :raises ValueError: if ``optimizer`` is an unknown name.
        """
        self._num_features = num_features
        self._epochs = epochs
        self._name = name
        self._batch_size = batch_size
        self._output_n_epoch = output_n_epoch
        self._lstm_size = lstm_size
        self._n_output = n_output
        self._time_steps = time_steps
        self._max_loss = max_loss
        self._max_pace = max_pace
        self._lasso = lasso
        self._ridge = ridge
        self._learning_rate = learning_rate
        # A string (including the default 'adam') has no `minimize`; turn it
        # into a real optimizer so that the default configuration works.
        self._optimizer = self._resolve_optimizer(optimizer, learning_rate)

        print("learning_rate=", learning_rate, "max_loss=", max_loss, "max_pace=", max_pace, "lasso=", lasso, "ridge=",
              ridge)

        self._graph_definition()

    @staticmethod
    def _resolve_optimizer(optimizer, learning_rate):
        """Return an optimizer object for ``optimizer``.

        Non-string values are returned unchanged. Strings are looked up in
        ``_OPTIMIZER_CLASSES`` and the matching ``tf.train`` class is
        instantiated with ``learning_rate``.
        """
        if not isinstance(optimizer, str):
            return optimizer
        class_name = LstmGanModel._OPTIMIZER_CLASSES.get(optimizer.lower())
        if class_name is None:
            raise ValueError("Unknown optimizer name: {!r}; expected one of {}".format(
                optimizer, sorted(LstmGanModel._OPTIMIZER_CLASSES)))
        return getattr(tf.train, class_name)(learning_rate)

    def _graph_definition(self):
        """Build placeholders, generator, discriminator, losses and train ops."""
        self._placeholder_definition()
        self._sess = tf.Session()
        with tf.variable_scope('generator'):
            self._hidden_layer()
            # Classification head: a linear layer on the final LSTM hidden state.
            self._output = tf.contrib.layers.fully_connected(self._hidden_rep, self._n_output,
                                                             activation_fn=tf.identity)
            self._pred = tf.nn.sigmoid(self._output, name="pred")
            self._hidden_state_decoder()

        # The same discriminator weights score both predicted and real last-visit features.
        with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE):
            self._fake_logits, self._fake_pred = self._hidden_layer_of_discriminator(self._predicted_last_x)
            self._real_logits, self._real_pred = self._hidden_layer_of_discriminator(self._last_x)

        # Generator adversarial loss: predicted features should be scored as real (label 1).
        self._gen_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self._fake_logits,
                                                                                labels=tf.ones_like(self._fake_logits)))
        # Discriminator loss: predicted features -> 0, real features -> 1.
        fake_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self._fake_logits, labels=tf.zeros_like(self._fake_logits)))
        real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self._real_logits,
                                                                           labels=tf.ones_like(self._real_logits)))
        self._loss_of_discriminator = tf.add(fake_loss, real_loss)

        self._loss()
        self._loss_regulation()

        # Each train op only updates the variables created under its own scope.
        train_vars = tf.trainable_variables()
        gen_vars = [var for var in train_vars if var.name.startswith('generator')]
        dis_vars = [var for var in train_vars if var.name.startswith('discriminator')]
        # Define the training operations.
        self._generator_train_op = self._optimizer.minimize(self._loss_of_whole_generator, var_list=gen_vars)
        self._discriminator_train_op = self._optimizer.minimize(self._loss_of_discriminator, var_list=dis_vars)

    def _hidden_layer_of_discriminator(self, samples):
        """Score ``samples`` with a single dense unit; return ``(logits, sigmoid(logits))``."""
        logits = tf.layers.dense(samples, 1)
        pred = tf.nn.sigmoid(logits)
        return logits, pred

    def _placeholder_definition(self):
        """Create the input placeholders (sequence, label, true last-visit features, keep_prob)."""
        self._x = tf.placeholder(tf.float32, [None, self._time_steps, self._num_features], 'input')
        self._y = tf.placeholder(tf.float32, [None, self._n_output], 'label')
        self._last_x = tf.placeholder(tf.float32, [None, self._num_features], "true_last_x")
        # Not consumed by any layer in this graph; `predict` still feeds it.
        self._keep_prob = tf.placeholder(tf.float32)

    def _hidden_layer(self):
        """Run the LSTM over ``self._x`` and keep its final hidden state as the representation."""
        lstm = tf.contrib.rnn.BasicLSTMCell(self._lstm_size)
        init_state = lstm.zero_state(tf.shape(self._x)[0], tf.float32)  # all-zero initial state

        mask, length = self._length()
        self._hidden, self._final_state = tf.nn.dynamic_rnn(lstm,
                                                            self._x,
                                                            sequence_length=length,
                                                            initial_state=init_state)
        self._hidden_rep = self._final_state.h

    def _hidden_state_decoder(self):
        """Predict the last-visit features from the hidden representation with a linear layer."""
        self._predicted_last_x = tf.contrib.layers.fully_connected(self._hidden_rep, self._num_features,
                                                                   activation_fn=tf.identity)

    def _loss(self):
        """Define the classification loss and the total generator loss."""
        self._loss_classification = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self._y, logits=self._output))
        # Total generator loss = classification loss + 1 x adversarial loss.
        self._loss_of_whole_generator = self._loss_classification + 1*self._gen_loss

    def _loss_regulation(self):
        """Add L1 (lasso) and/or L2 (ridge) penalties over all variables of each scope."""
        if self._lasso != 0:
            for trainable_variables in tf.trainable_variables("generator"):
                self._loss_of_whole_generator += tf.contrib.layers.l1_regularizer(self._lasso)(trainable_variables)
            for trainable_variables in tf.trainable_variables("discriminator"):
                self._loss_of_discriminator += tf.contrib.layers.l1_regularizer(self._lasso)(trainable_variables)
        if self._ridge != 0:
            for trainable_variables in tf.trainable_variables("generator"):
                self._loss_of_whole_generator += tf.contrib.layers.l2_regularizer(self._ridge)(trainable_variables)
            for trainable_variables in tf.trainable_variables("discriminator"):
                self._loss_of_discriminator += tf.contrib.layers.l2_regularizer(self._ridge)(trainable_variables)

    def _length(self):
        """Return ``(mask, length)`` for the input sequences.

        ``mask`` is 1 for every time step that has at least one non-zero
        feature and 0 otherwise; ``length`` is the per-example count of such
        steps, cast to int32.
        """
        mask = tf.sign(tf.reduce_max(tf.abs(self._x), 2))
        length = tf.reduce_sum(mask, 1)
        length = tf.cast(length, tf.int32)
        return mask, length

    def _train_single_batch(self, dynamic_features, labels, last_features):
        """Run one generator update followed by one discriminator update on the same batch."""
        feed = {self._x: dynamic_features,
                self._y: labels,
                self._last_x: last_features}
        self._sess.run(self._generator_train_op, feed_dict=feed)
        self._sess.run(self._discriminator_train_op, feed_dict=feed)

    def _loss_on_training_set(self, data_set):
        """Evaluate (total generator, classification, adversarial, discriminator) losses on ``data_set``."""
        return self._sess.run(
            (self._loss_of_whole_generator, self._loss_classification, self._gen_loss, self._loss_of_discriminator),
            feed_dict={self._x: data_set.dynamic_features,
                       self._y: data_set.labels,
                       self._last_x: data_set.last_features})

    def fit(self, train_set, test_set):
        """Re-initialize all variables and train for ``epochs`` epochs.

        Progress is printed once before the first update and then every
        ``output_n_epoch`` completed epochs.
        """
        self._sess.run(tf.global_variables_initializer())
        train_set.epoch_completed = 0

        # NOTE(review): this filters by scope `self._name`, while the variables are created
        # under the 'generator' / 'discriminator' scopes, so this presumably prints nothing.
        for c in tf.trainable_variables(self._name):
            print(c.name)

        print("auc\tepoch\tloss\tloss_diff\tloss_classification\tloss_generator\tloss_discriminator")
        logged = set()
        loss = 0
        while train_set.epoch_completed < self._epochs:
            dynamic_features, labels, last_features = train_set.next_batch(self._batch_size)

            if train_set.batch_completed == 1:
                # Baseline metrics before any update.
                loss = self.show_training(train_set, test_set, loss)

            self._train_single_batch(dynamic_features, labels, last_features)
            # `logged` ensures each reporting epoch is printed once, not once per batch.
            if train_set.epoch_completed != 0 and train_set.epoch_completed % self._output_n_epoch == 0 and train_set.epoch_completed not in logged:
                logged.add(train_set.epoch_completed)
                loss = self.show_training(train_set, test_set, loss)

    def predict(self, test_set):
        """Return the predicted probabilities (last output column) as a column vector."""
        return np.expand_dims(
            self._sess.run(self._pred, feed_dict={self._x: test_set.dynamic_features, self._keep_prob: 1})[:, -1],
            1)

    def show_training(self, train_set, test_set, loss):
        """Print one progress line and return the current total generator loss.

        The line holds the test AUC, completed epochs, training losses, the
        change from the previous ``loss`` and a timestamp.
        """
        loss_prev = loss
        loss, loss_classification, loss_generator, loss_discriminator = self._loss_on_training_set(train_set)
        loss_diff = loss_prev - loss
        y_score = self.predict(test_set)
        auc = roc_auc_score(test_set.labels[:, -1], y_score)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(auc, train_set.epoch_completed, loss, loss_diff,
                                                      loss_classification, loss_generator, loss_discriminator,
                                                      time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        return loss

    @property
    def name(self):
        return self._name

    def close(self):
        """Close the session and reset the default TensorFlow graph."""
        self._sess.close()
        tf.reset_default_graph()

# Evaluation.py

In [None]:
import csv
import os

import sklearn
import time

import numpy as np
import xlwt
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score, roc_curve  # roc计算曲线
from sklearn.model_selection import StratifiedShuffleSplit  
import tensorflow as tf
from models import LstmGanModel
from data import read_data, DataSet


class ExperimentSetup(object):
    """Hyper-parameters and settings shared by the experiments in this cell."""

    # --- evaluation protocol ---
    kfold = 5
    max_times_of_visits = 3

    # --- model size ---
    lstm_size = 128
    encoder_size = 50

    # --- optimisation ---
    learning_rate = 0.001
    ridge_l2 = 0.001
    batch_size = 64
    epochs = 10
    output_n_epochs = 1


def evaluate(test_index, y_label, y_score, file_name):
    """Compute binary-classification metrics and write them to ``<file_name>.xls``.

    The decision threshold is the ROC threshold that maximizes ``tpr - fpr``.
    The sheet holds the per-sample index, label, probability and predicted
    label, the ROC curve points, and the summary metrics (accuracy, AUC,
    recall, precision, F1 and the chosen threshold).

    :param test_index: indices of the test samples
    :param y_label: true labels of the test samples
    :param y_score: predicted probabilities of the test samples
    :param file_name: output file path without the ``.xls`` extension
    """
    # Callers pass (n, 1) column vectors. Flatten so that each element is a true
    # scalar; int()/float() on 1-element arrays is deprecated in recent NumPy.
    test_index = np.asarray(test_index).ravel()
    labels_1d = np.asarray(y_label).ravel()
    scores_1d = np.asarray(y_score).ravel()

    # The first positional argument of xlwt.Workbook is `encoding`, not a path;
    # the file name is only needed for `save`.
    wb = xlwt.Workbook()
    table = wb.add_sheet('Sheet1')
    table_title = ["test_index", "label", "prob", "pre", " ", "fpr", "tpr", "thresholds", " ",
                   "acc", "auc", "recall", "precision", "f1-score", "threshold"]
    for col, title in enumerate(table_title):
        table.write(0, col, title)

    def column(title):
        # Column position of a header (first occurrence, as with list.index).
        return table_title.index(title)

    auc = roc_auc_score(labels_1d, scores_1d)

    fpr, tpr, thresholds = roc_curve(labels_1d, scores_1d, pos_label=1)
    threshold = thresholds[np.argmax(tpr - fpr)]

    # ROC curve points, one per row.
    for i in range(len(fpr)):
        table.write(i + 1, column("fpr"), float(fpr[i]))
        table.write(i + 1, column("tpr"), float(tpr[i]))
        table.write(i + 1, column("thresholds"), float(thresholds[i]))
    table.write(1, column("threshold"), float(threshold))

    y_pred_label = (scores_1d >= threshold) * 1
    acc = accuracy_score(labels_1d, y_pred_label)
    recall = recall_score(labels_1d, y_pred_label)
    precision = precision_score(labels_1d, y_pred_label)
    f1 = f1_score(labels_1d, y_pred_label)

    # Per-sample results, one per row.
    for i in range(len(test_index)):
        table.write(i + 1, column("test_index"), int(test_index[i]))
        table.write(i + 1, column("label"), int(labels_1d[i]))
        table.write(i + 1, column("prob"), float(scores_1d[i]))
        table.write(i + 1, column("pre"), int(y_pred_label[i]))

    # write metrics
    table.write(1, column("auc"), float(auc))
    table.write(1, column("acc"), float(acc))
    table.write(1, column("recall"), float(recall))
    table.write(1, column("precision"), float(precision))
    table.write(1, column("f1-score"), float(f1))

    wb.save(file_name + ".xls")


def model_experiments(model, data_set, result_file):
    """Run stratified k-fold cross-validation of ``model`` and write the pooled results.

    For every fold the model is fitted on the training part and used to
    predict the held-out part. The predictions of all folds are concatenated
    and passed to ``evaluate``. The model is closed when the function exits,
    whether it finishes normally or a fold raises.

    :param model: object exposing ``fit(train_set, test_set)``, ``predict(test_set)`` and ``close()``
    :param data_set: ``DataSet`` holding all samples
    :param result_file: output path (without extension) forwarded to ``evaluate``
    :return: whatever ``evaluate`` returns
    """
    dynamic_features = data_set.dynamic_features
    labels = data_set.labels
    last_features = data_set.last_features
    kf = sklearn.model_selection.StratifiedKFold(n_splits=ExperimentSetup.kfold, shuffle=False)

    n_output = 1  # classes

    # Accumulators for the held-out indices, predictions and labels of all folds.
    tol_test_idx = np.zeros(0, dtype=np.int32)
    tol_pred = np.zeros(shape=(0, n_output))
    tol_label = np.zeros(shape=(0, n_output), dtype=np.int32)

    try:
        # Folds are stratified on the last label column.
        for i, (train_idx, test_idx) in enumerate(kf.split(X=dynamic_features, y=labels[:, -1]), start=1):
            train_set = DataSet(dynamic_features[train_idx], labels[train_idx], last_features[train_idx])

            test_y = labels[test_idx]
            test_set = DataSet(dynamic_features[test_idx], test_y, last_features[test_idx])

            print("learning_rate = ", ExperimentSetup.learning_rate)
            model.fit(train_set, test_set)

            y_score = model.predict(test_set)
            tol_test_idx = np.concatenate((tol_test_idx, test_idx))
            tol_pred = np.vstack((tol_pred, y_score))
            tol_label = np.vstack((tol_label, np.expand_dims(test_y[:, -1], 1)))
            print("Cross validation: {} of {}".format(i, ExperimentSetup.kfold),
                  time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    finally:
        # Release the model's resources even if a fold fails.
        model.close()

    return evaluate(tol_test_idx, tol_label, tol_pred, result_file)


def denoising_auto_encoder_lstm_decoder_model_experiments(result_file):
    """Build an ``LstmGanModel`` sized to the data and run the cross-validated experiment.

    The data set is loaded with ``read_data`` using
    ``ExperimentSetup.max_times_of_visits``. The model dimensions are taken
    from the loaded arrays and the remaining hyper-parameters from
    ``ExperimentSetup``.

    :param result_file: output path (without extension) for the evaluation results
    :return: whatever ``model_experiments`` returns
    """
    # The previous `toy.train` referred to an undefined name `toy` (NameError).
    # The model needs a DataSet with `dynamic_features` / `labels`, which
    # `read_data` provides.
    data_set = read_data(ExperimentSetup.max_times_of_visits)
    dynamic_features = data_set.dynamic_features
    labels = data_set.labels

    # dynamic_features is indexed (sample, time step, feature).
    num_features = dynamic_features.shape[2]
    time_steps = dynamic_features.shape[1]
    n_output = labels.shape[1]

    model = LstmGanModel(num_features=num_features,
                         time_steps=time_steps,
                         lstm_size=ExperimentSetup.lstm_size,
                         n_output=n_output,
                         batch_size=ExperimentSetup.batch_size,
                         epochs=ExperimentSetup.epochs,
                         output_n_epoch=ExperimentSetup.output_n_epochs,
                         learning_rate=ExperimentSetup.learning_rate,
                         ridge=ExperimentSetup.ridge_l2,
                         optimizer=tf.train.AdamOptimizer(ExperimentSetup.learning_rate))
    return model_experiments(model, data_set, result_file)


if __name__ == '__main__':
    # Repeat the cross-validated LSTM-GAN experiment 20 times. Every run writes
    # its own result file, numbered 1..20, into the same output directory.
    for i_times in range(20):
        save_path = "heart_3mon_3times_10epoch_1gen_LR0.001_L0.001"
        print("save to " + save_path)
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        print("LSTM_GAN")
        result_prefix = '{}/LSTM_GAN_1-{}'.format(save_path, i_times + 1)
        denoising_auto_encoder_lstm_decoder_model_experiments(result_prefix)