# Changes made by kts4 / noami2
- `glove` was not available, so `mittens` was installed and used instead
  - `from mittens import GloVe as glove`
- Removed `merge` from import from `keras.layers`
  - No longer available and not used.
- `set_session`, `clear_session` and `get_session` are no longer available from `keras.backend.tensorflow_backend`
  - Loaded from `tf.compat.v1.keras.backend` instead
- `reset_keras()` function was missing. 
  - Copied over from nb7
- Removed non-required variable `type_of_ner` and hardcoded value instead
- Updated `avg_ner_model` to set appropriate dimensions for ablations
- Allow user to select which embedding techniques to calculate
- Ensured same results as paper
   - Removed unused calls to use `LSTM` models.
   - Calculated using `unit_size` of 256 only
- `tf.contrib.layers.l2_regularizer` no longer available
  - Used `tf.keras.regularizers.l2` instead
- `tf.contrib.layers.xavier_initializer` no longer available
  - Used `tf.keras.initializers.GlorotUniform`
- Added a `mean` function so that the pickle files could be loaded
  - Without it, loading the pickle files raised an error
- `iter_num` was set to `2` rather than `11`
  - Need it at `11` so that there are 10 runs of the code
- Bug fix: `ner_word2vec` was being used to select the embeddings instead of `embed_dict`
- Changes to newlines / spacings / printing etc.

In [None]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from mittens import GloVe as glove

import collections
import gc 

import keras
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Flatten, Dense, Dropout, Input, concatenate, Activation, Concatenate, GRU
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Conv1D, BatchNormalization, GRU, Convolution1D
from keras.layers import UpSampling1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPool1D

from keras.optimizers import Adam

from keras.callbacks import EarlyStopping, ModelCheckpoint, History, ReduceLROnPlateau
from keras.utils import np_utils
from tensorflow.compat.v1.keras.backend import set_session, clear_session, get_session
import tensorflow as tf


from sklearn.utils import class_weight
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Switches for the embedding techniques. Each flag gates the loading of the
# matching NER embedding pickle later in this file (run_combined corresponds
# to the "combined"/concat embeddings).
run_word2vec     = True
run_fastText     = True
run_combined     = True
run_blueBERT     = False
run_clinicalBERT = False

In [None]:
def reset_keras(model):
    """Reset the Keras backend session and trigger garbage collection.

    The currently active backend session is fetched, the Keras state is
    cleared, that session is closed, and a new session is requested from the
    backend. Finally a garbage-collection pass is run.

    Args:
        model: The model that has just been used. Only this function's local
            reference to it is dropped; any reference the caller still holds
            keeps the object alive until the caller rebinds or deletes it.
    """
    # Grab the session that is active right now so it can be closed after
    # the Keras graph/state has been cleared.
    sess = get_session()
    clear_session()
    sess.close()

    # Ask the backend for a session again so that a fresh one is available.
    get_session()

    # Deleting a parameter cannot fail, so no exception handling is needed.
    # Note that this only removes the local name, not the caller's variable.
    del model

    gc.collect()

def create_dataset(dict_of_ner):
    """Average the embeddings stored under each key of ``dict_of_ner``.

    Entries are visited in sorted item order. For every entry, the embeddings
    in its value are averaged along axis 0, and the per-entry averages are
    stacked into a single array.

    Args:
        dict_of_ner: Mapping from a key to an iterable of embedding vectors.

    Returns:
        ``np.ndarray`` holding one averaged vector per key, in sorted order.
    """
    averaged_rows = [
        np.mean(list(embeddings), axis=0)
        for _, embeddings in sorted(dict_of_ner.items())
    ]
    return np.asarray(averaged_rows)

def make_prediction_multi_avg(model, test_data):
    """Run ``model.predict`` and threshold the outputs at 0.5.

    Args:
        model: Object exposing a ``predict`` method.
        test_data: Input passed unchanged to ``model.predict``.

    Returns:
        Tuple ``(probs, y_pred)`` where ``probs`` is whatever ``predict``
        returned and ``y_pred`` is a list with 1 for every output >= 0.5 and
        0 otherwise.
    """
    probabilities = model.predict(test_data)

    labels = []
    for probability in probabilities:
        # The threshold is inclusive: exactly 0.5 counts as positive.
        labels.append(1 if probability >= 0.5 else 0)

    return probabilities, labels

def save_scores_multi_avg(predictions, probs, ground_truth,
                          embed_name, problem_type, iteration, hidden_unit_size,
                          sequence_name):
    """Compute evaluation metrics for one run and pickle them to disk.

    The metrics are stored in a dict with the keys ``'auc'``, ``'auprc'``,
    ``'acc'`` and ``'F1'`` and written to
    ``results/08-multimodal/<sequence_name>-<hidden_unit_size>-<embed_name>-<problem_type>-<iteration>-new-avg-.p``.

    Args:
        predictions: Hard 0/1 predictions, used for accuracy and F1.
        probs: Predicted scores, used for ROC AUC and average precision.
        ground_truth: True labels.
        embed_name: Name of the embedding, part of the file name.
        problem_type: Name of the target problem, part of the file name.
        iteration: Run number, part of the file name.
        hidden_unit_size: Hidden unit size, part of the file name.
        sequence_name: Name of the sequence layer, part of the file name.
    """
    result_dict = {
        'auc': roc_auc_score(ground_truth, probs),
        'auprc': average_precision_score(ground_truth, probs),
        'acc': accuracy_score(ground_truth, predictions),
        'F1': f1_score(ground_truth, predictions),
    }

    result_path = "results/08-multimodal/"
    # Create the output directory on demand so that a finished training run
    # is not lost because the folder is missing.
    os.makedirs(result_path, exist_ok=True)

    file_name = (
        f"{sequence_name}-{hidden_unit_size}-{embed_name}"
        f"-{problem_type}-{iteration}-new-avg-.p"
    )
    pd.to_pickle(result_dict, os.path.join(result_path, file_name))
    
def avg_ner_model(layer_name, number_of_unit, embedding_name):
    """Build and compile the GRU + averaged-embedding binary classifier.

    The model takes two inputs: a sequence input of shape ``(24, 104)`` that
    is encoded by a GRU, and a flat vector input named ``"avg"``. The GRU
    output and the vector are concatenated, passed through a 256-unit ReLU
    layer with dropout 0.2, and mapped to a single sigmoid unit (no bias,
    Glorot-uniform initialisation, L2 regularisation of 0.01).

    Args:
        layer_name: Not used by this function; a GRU is always built. The
            parameter is kept so that existing callers keep working.
        number_of_unit: Number of units of the GRU layer.
        embedding_name: Selects the size of the ``"avg"`` input:
            ``"concat"`` -> 200, ``"clinicalBERT"`` / ``"blueBERT"`` -> 768,
            anything else -> 100.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, ``'acc'`` metric).
    """
    # Width of the averaged-embedding input per embedding type; any name that
    # is not listed falls back to 100.
    embedding_dimensions = {
        "concat": 200,
        "clinicalBERT": 768,
        "blueBERT": 768,
    }
    input_dimension = embedding_dimensions.get(embedding_name, 100)

    sequence_input = Input(shape=(24, 104))
    input_avg = Input(shape=(input_dimension, ), name="avg")

    x = GRU(number_of_unit)(sequence_input)
    x = keras.layers.Concatenate()([x, input_avg])

    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)

    logits_regularizer = tf.keras.regularizers.L2(l2=0.01)

    preds = Dense(1,
                  activation='sigmoid',
                  use_bias=False,
                  kernel_initializer=tf.keras.initializers.GlorotUniform(),
                  kernel_regularizer=logits_regularizer
                 )(x)

    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias for the same setting.
    opt = Adam(learning_rate=0.001, decay=0.01)
    model = Model(inputs=[sequence_input, input_avg], outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])

    return model

In [None]:
def mean(a):
    """Return the arithmetic mean of the items in ``a``.

    Raises:
        ZeroDivisionError: If ``a`` is empty.
    """
    total = sum(a)
    count = len(a)
    return total / count

In [None]:
# Inputs for the train / dev / test splits (fed to the model's sequence input).
x_train = pd.read_pickle("data/new_x_train.pkl")
x_dev   = pd.read_pickle("data/new_x_dev.pkl")
x_test  = pd.read_pickle("data/new_x_test.pkl")

# Labels for the same splits; indexed by target-problem name during training.
y_train = pd.read_pickle("data/new_y_train.pkl")
y_dev   = pd.read_pickle("data/new_y_dev.pkl")
y_test  = pd.read_pickle("data/new_y_test.pkl")

# NER embedding dictionaries: only the ones that are switched on are loaded.
if run_word2vec:
    ner_word2vec = pd.read_pickle("data/new_ner_word2vec_limited_dict.pkl")

if run_fastText:
    ner_fasttext = pd.read_pickle("data/new_ner_fasttext_limited_dict.pkl")

if run_combined:
    ner_concat = pd.read_pickle("data/new_ner_combined_limited_dict.pkl")

if run_clinicalBERT:
    ner_clinicalBERT = pd.read_pickle("data/new_ner_clinicalbert_limited_dict.pkl")

if run_blueBERT:
    ner_blueBERT = pd.read_pickle("data/new_ner_bluebert_limited_dict.pkl")

# Identifiers of each split, used to look up entries in the embedding dicts.
train_ids = pd.read_pickle("data/new_train_ids.pkl")
dev_ids   = pd.read_pickle("data/new_dev_ids.pkl")
test_ids  = pd.read_pickle("data/new_test_ids.pkl")

In [None]:
# Parallel lists: embedding_types[i] is the name of embedding_dict[i].
embedding_types = []
embedding_dict  = []

# Each embedding is registered only when its own flag is set. The matching
# ner_* dictionary is only loaded under that same flag, so testing any other
# flag here would reference a name that was never defined.
if run_word2vec:
    embedding_types.append('word2vec')
    embedding_dict.append(ner_word2vec)
if run_fastText:
    embedding_types.append('fasttext')
    embedding_dict.append(ner_fasttext)
if run_combined:
    embedding_types.append('concat')
    embedding_dict.append(ner_concat)
if run_clinicalBERT:
    embedding_types.append('clinicalBERT')
    embedding_dict.append(ner_clinicalBERT)
if run_blueBERT:
    embedding_types.append('blueBERT')
    embedding_dict.append(ner_blueBERT)


target_problems = ['mort_hosp', 'mort_icu', 'los_3', 'los_7']

num_epoch        = 100
model_patience   = 5
monitor_criteria = 'val_loss'
batch_size       = 64
iter_num         = 11   # range(1, iter_num) below gives 10 runs
unit_sizes       = [256]
layers           = ["GRU"]

# The checkpoint callback writes into this folder; create it up front so the
# first save does not fail when the folder is missing.
os.makedirs("results/Best Models", exist_ok=True)

for each_layer in layers:
    for each_unit_size in unit_sizes:
        for embed_dict, embed_name in zip(embedding_dict, embedding_types):

            # Restrict the embedding dictionary to the ids of each split.
            temp_train_ner = {k: embed_dict[k] for k in train_ids}
            temp_dev_ner   = {k: embed_dict[k] for k in dev_ids}
            temp_test_ner  = {k: embed_dict[k] for k in test_ids}

            # One averaged embedding vector per id.
            x_train_ner = create_dataset(temp_train_ner)
            x_dev_ner   = create_dataset(temp_dev_ner)
            x_test_ner  = create_dataset(temp_test_ner)

            for iteration in tqdm(range(1, iter_num)):
                for each_problem in target_problems:

                    print("Layer: ", each_layer)
                    print("Hidden unit: ", each_unit_size)
                    print("Embedding: ", embed_name)
                    print("Iteration number: ", iteration)
                    print("Problem type: ", each_problem)
                    print("__________________")

                    early_stopping_monitor = EarlyStopping(monitor=monitor_criteria,
                                                           patience=model_patience)

                    best_model_name = "results/Best Models/avg-"+str(embed_name)+"-"+str(each_problem)+"-best_model.hdf5"

                    # Monitor the same quantity as early stopping. The
                    # deprecated `period` argument is dropped:
                    # save_freq='epoch' already checks after every epoch.
                    checkpoint = ModelCheckpoint(best_model_name,
                                                 monitor=monitor_criteria,
                                                 verbose=0,
                                                 save_best_only=True,
                                                 mode='min',
                                                 save_freq='epoch')

                    callbacks = [early_stopping_monitor, checkpoint]

                    model = avg_ner_model(each_layer,
                                          each_unit_size,
                                          embed_name)

                    model.fit([x_train, x_train_ner],
                              y_train[each_problem],
                              epochs=num_epoch,
                              verbose=0,
                              validation_data=([x_dev, x_dev_ner], y_dev[each_problem]),
                              callbacks=callbacks,
                              batch_size=batch_size)

                    # Evaluate with the best checkpoint rather than the
                    # weights of the last epoch.
                    model.load_weights(best_model_name)

                    probs, predictions = make_prediction_multi_avg(model, [x_test, x_test_ner])

                    save_scores_multi_avg(predictions,
                                          probs,
                                          y_test[each_problem],
                                          embed_name,
                                          each_problem,
                                          iteration,
                                          each_unit_size,
                                          each_layer)

                    # Release backend state before the next model is built.
                    reset_keras(model)
                    clear_session()
                    gc.collect()