In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score

from run_readmission import *

import gensim

m = gensim.models.KeyedVectors.load('word2vec.model')
weights = (m[m.wv.vocab])

max_words_count = 44082
embedding_size = 100
max_words_length = 318

04/16/2022 13:06:39 - INFO - gensim.utils -   loading Word2VecKeyedVectors object from word2vec.model
04/16/2022 13:06:40 - INFO - gensim.utils -   loading wv recursively from word2vec.model.wv.* with mmap=None
04/16/2022 13:06:40 - INFO - gensim.utils -   setting ignored attribute vectors_norm to None
04/16/2022 13:06:40 - INFO - gensim.utils -   loading vocabulary recursively from word2vec.model.vocabulary.* with mmap=None
04/16/2022 13:06:40 - INFO - gensim.utils -   loading trainables recursively from word2vec.model.trainables.* with mmap=None
04/16/2022 13:06:40 - INFO - gensim.utils -   setting ignored attribute cum_table to None
04/16/2022 13:06:40 - INFO - gensim.utils -   loaded word2vec.model
  weights = (m[m.wv.vocab])


In [2]:
def vote_score(df, score, readmission_mode, output_dir):
    df['pred_score'] = score
    df_sort = df.sort_values(by=['ID'])
    #score 
    temp = (df_sort.groupby(['ID'])['pred_score'].agg(max)+df_sort.groupby(['ID'])['pred_score'].agg(sum)/2)/(1+df_sort.groupby(['ID'])['pred_score'].agg(len)/2)
    x = df_sort.groupby(['ID'])['Label'].agg(np.min).values
    df_out = pd.DataFrame({'logits': temp.values, 'ID': x})

    fpr, tpr, thresholds = roc_curve(x, temp.values)
    auc_score = auc(fpr, tpr)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='Val (area = {:.3f})'.format(auc_score))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    string = 'auroc_bilstm_'+readmission_mode+'.png'
    plt.savefig(os.path.join(output_dir, string))
    plt.close('all')

    return fpr, tpr, df_out
    
def pr_curve_plot(y, y_score, readmission_mode, output_dir):
    precision, recall, _ = precision_recall_curve(y, y_score)
    area = auc(recall,precision)
    step_kwargs = ({'step': 'post'}
                   if 'step' in signature(plt.fill_between).parameters
                   else {})
    
    plt.figure(2)
    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: AUC={0:0.2f}'.format(
              area))
    
    string = 'auprc_bilstm_'+readmission_mode+'.png'
    plt.savefig(os.path.join(output_dir, string))
    plt.close('all')
    
def vote_pr_curve(df, score, readmission_mode, output_dir):
    df['pred_score'] = score
    df_sort = df.sort_values(by=['ID'])
    #score 
    temp = (df_sort.groupby(['ID'])['pred_score'].agg(max)+df_sort.groupby(['ID'])['pred_score'].agg(sum)/2)/(1+df_sort.groupby(['ID'])['pred_score'].agg(len)/2)
    y = df_sort.groupby(['ID'])['Label'].agg(np.min).values
    
    precision, recall, thres = precision_recall_curve(y, temp)
    pr_thres = pd.DataFrame(data =  list(zip(precision, recall, thres)), columns = ['prec','recall','thres'])
    vote_df = pd.DataFrame(data =  list(zip(temp, y)), columns = ['score','label'])
    
    pr_curve_plot(y, temp, readmission_mode, output_dir)
    
    temp = pr_thres[pr_thres.prec > 0.799999].reset_index()
    
    rp80 = 0
    if temp.size == 0:
        print('Test Sample too small or RP80=0')
    else:
        rp80 = temp.iloc[0].recall
        print('Recall at Precision of 80 is {}', rp80)

    return rp80

def count_parameters(model): 
    return len(model.get_weights()[0]) * len(model.get_weights())


In [3]:
timepoints = ['early', 'discharge']

for timepoint in timepoints:
    readmission_mode = 'discharge' if timepoint == 'discharge' else 'early'
    if timepoint == 'discharge':
        df_train = pd.read_csv('data/discharge/train.csv')
        df_val = pd.read_csv('data/discharge/val.csv')
        df_test = pd.read_csv('data/discharge/test.csv')
    else: 
        df_train = pd.read_csv('data/3days/train.csv')
        df_val = pd.read_csv('data/3days/val.csv')
        df_test = pd.read_csv('data/2days/test.csv').append(pd.read_csv('data/3days/test.csv'))
    sent_train = df_train['TEXT']
    y_train = df_train['Label']
    sent_val = df_val['TEXT']
    y_val = df_val['Label']
    sent_test = df_test['TEXT']
    y_test = df_test['Label']

    tokenizer = Tokenizer(num_words=max_words_count)
    tokenizer.fit_on_texts(sent_train)
    tokens_train = tokenizer.texts_to_sequences(sent_train)
    tokens_val = tokenizer.texts_to_sequences(sent_val)
    tokens_test = tokenizer.texts_to_sequences(sent_test)

    x_train = pad_sequences(tokens_train, maxlen=max_words_length)
    x_val = pad_sequences(tokens_val, maxlen=max_words_length)
    x_test = pad_sequences(tokens_test, maxlen=max_words_length)

    word_idx = tokenizer.word_index
    embed_dict = dict(zip(list(m.wv.vocab), list(m[m.wv.vocab])))
    all_embs = np.stack(embed_dict.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words_count, embedding_size))
    for word, j in word_idx.items():
        if j < max_words_count:
            vec_temp = embed_dict.get(word)
            if vec_temp is not None:
                embedding_matrix[j] = vec_temp
    
    if os.path.exists('./bilstm_models/best_model_' + readmission_mode + '.h5'):
        print('Pre-trained Bi-LSTM model found. Loading model from .h5 file.')
        model = load_model('./bilstm_models/best_model_' + readmission_mode + '.h5')
    else:
        print('Pre-trained Bi-LSTM model not found. Training new model.')
        inp = Input(shape=(max_words_length,))
        x = Embedding(max_words_count, embedding_size, weights=[embedding_matrix])(inp)
        x = Bidirectional(LSTM(embedding_size, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = GlobalMaxPool1D()(x)
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['mae', 'accuracy'])
        save_path = './bilstm_models/best_model_' + readmission_mode + '.h5'
        callbacks = [EarlyStopping(monitor='val_loss', patience=2),
                     ModelCheckpoint(filepath=save_path, monitor='val_loss', save_best_only=True)]
        history = model.fit(x_train, y_train, batch_size=64, epochs=3, callbacks=callbacks, verbose=1,
                            validation_data=(x_val, y_val))

    print('Training completed. Number of parameters: ' + str(count_parameters(model)))
    y_train_preds = model.predict(x_train)
    y_test_preds = model.predict(x_test)
    y_test_actual = df_test['Label']

    output_dir = './results/bilstm/result_bilstm_' + readmission_mode

    fpr, tpr, df_out = vote_score(df_test, y_test_preds, readmission_mode, output_dir)
    rp80 = vote_pr_curve(df_test, y_test_preds, readmission_mode, output_dir)

    y_test_pred = [1 if i > 0.5 else 0 for i in y_test_preds]
    result = {'eval_loss': 'N/A',
              'eval_accuracy': str(accuracy_score(y_test_actual, y_test_pred)),                 
              'global_step': 'N/A',
              'training loss': 'N/A',
              'RP80': rp80}
    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    print('Completed evaluation of baseline Bi-LSTM model for readmission task: ' + readmission_mode)

  embed_dict = dict(zip(list(m.wv.vocab), list(m[m.wv.vocab])))
  if (await self.run_code(code, result,  async_=asy)):


Pre-trained Bi-LSTM model found. Loading model from .h5 file.
Training completed. Number of parameters: 484902


04/16/2022 13:11:02 - INFO - run_readmission -   ***** Eval results *****
04/16/2022 13:11:02 - INFO - run_readmission -     RP80 = 0.23826714801444043
04/16/2022 13:11:02 - INFO - run_readmission -     eval_accuracy = 0.6061084781463928
04/16/2022 13:11:02 - INFO - run_readmission -     eval_loss = N/A
04/16/2022 13:11:02 - INFO - run_readmission -     global_step = N/A
04/16/2022 13:11:02 - INFO - run_readmission -     training loss = N/A


Recall at Precision of 80 is {} 0.23826714801444043
Completed evaluation of baseline Bi-LSTM model for readmission task: early


  embed_dict = dict(zip(list(m.wv.vocab), list(m[m.wv.vocab])))
  if (await self.run_code(code, result,  async_=asy)):


Pre-trained Bi-LSTM model found. Loading model from .h5 file.
Training completed. Number of parameters: 484902


04/16/2022 13:13:31 - INFO - run_readmission -   ***** Eval results *****
04/16/2022 13:13:31 - INFO - run_readmission -     RP80 = 0.15517241379310345
04/16/2022 13:13:31 - INFO - run_readmission -     eval_accuracy = 0.6160626836434868
04/16/2022 13:13:31 - INFO - run_readmission -     eval_loss = N/A
04/16/2022 13:13:31 - INFO - run_readmission -     global_step = N/A
04/16/2022 13:13:31 - INFO - run_readmission -     training loss = N/A


Recall at Precision of 80 is {} 0.15517241379310345
Completed evaluation of baseline Bi-LSTM model for readmission task: discharge
