# Changes made by kts4 / noami2
- `glove` was not available and so needed to install `mittens` instead
  - `from mittens import GloVe as glove`
- Removed `merge` from import from `keras.layers`
  - No longer available.
  - Used `Concatenate()([x, text_embeddings])` instead of `merge([x, text_embeddings], mode='concat', concat_axis=1)`
  - Code was available already, just commented out.
- `set_session`, `clear_session` and `get_session` are no longer available from `keras.backend.tensorflow_backend`
  - Loaded from `tf.compat.v1.keras.backend` and `tf.keras.backend` instead
- Imported `Path`
  - Allows us to ensure the required directories for saving are available
- We do not have the `FastText` model and so all references to it and combined model have to be commented out
- `tf.contrib.layers.l2_regularizer` no longer available
  - Used `tf.keras.regularizers.l2` instead
- `tf.contrib.layers.xavier_initializer` no longer available
  - Used `tf.keras.initializers.GlorotUniform` instead
- Changed `x_train_ner = np.asarray(x_train_dict_sorted.values())` to `x_train_ner = np.array(list(x_train_dict_sorted.values()))`
  - Change due to Python 3
- Changes to newlines / spacings / printing etc.

In [4]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, FastText
from mittens import GloVe as glove

import collections
import gc 

import keras
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Flatten, Dense, Dropout, Input, concatenate, Activation, Concatenate, GRU
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Conv1D, BatchNormalization, GRU, Convolution1D
from keras.layers import UpSampling1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPool1D

from keras.optimizers import Adam

from keras.callbacks import EarlyStopping, ModelCheckpoint, History, ReduceLROnPlateau
from keras.utils import np_utils
from tensorflow.compat.v1.keras.backend import set_session, clear_session, get_session
import tensorflow as tf


from sklearn.utils import class_weight
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

from pathlib import Path
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [6]:
x_train = pd.read_pickle("data/new_x_train.pkl")
x_dev   = pd.read_pickle("data/new_x_dev.pkl")
x_test  = pd.read_pickle("data/new_x_test.pkl")

y_train = pd.read_pickle("data/new_y_train.pkl")
y_dev   = pd.read_pickle("data/new_y_dev.pkl")
y_test  = pd.read_pickle("data/new_y_test.pkl")


ner_word2vec     = pd.read_pickle("data/new_ner_word2vec_limited_dict.pkl")
ner_fasttext     = pd.read_pickle("data/new_ner_fasttext_limited_dict.pkl")
ner_concat       = pd.read_pickle("data/new_ner_combined_limited_dict.pkl")
# ner_clinicalBERT = pd.read_pickle("data/new_ner_clinicalbert_limited_dict.pkl")
# ner_blueBERT      = pd.read_pickle("data/new_ner_bluebert_limited_dict.pkl")

train_ids = pd.read_pickle("data/new_train_ids.pkl")
dev_ids   = pd.read_pickle("data/new_dev_ids.pkl")
test_ids  = pd.read_pickle("data/new_test_ids.pkl")

In [8]:
def make_prediction_cnn(model, test_data):
    probs = model.predict(test_data)
    y_pred = [1 if i>=0.5 else 0 for i in probs]
    return probs, y_pred

def save_scores_cnn(predictions, probs, ground_truth, 
                          embed_name, problem_type, iteration, hidden_unit_size,
                          sequence_name):
    
    auc   = roc_auc_score(ground_truth, probs)
    auprc = average_precision_score(ground_truth, probs)
    acc   = accuracy_score(ground_truth, predictions)
    F1    = f1_score(ground_truth, predictions)
    
    result_dict          = {}    
    result_dict['auc']   = auc
    result_dict['auprc'] = auprc
    result_dict['acc']   = acc
    result_dict['F1']    = F1

    result_path = "results/09-cnn/"
    file_name = str(sequence_name)+"-"+str(hidden_unit_size)+"-"+embed_name
    file_name = file_name +"-"+problem_type+"-"+str(iteration)+"-new-cnn-.p"
    pd.to_pickle(result_dict, os.path.join(result_path, file_name))
    
def print_scores_cnn(predictions, probs, ground_truth, model_name, problem_type, iteration, hidden_unit_size):
    auc   = roc_auc_score(ground_truth, probs)
    auprc = average_precision_score(ground_truth, probs)
    acc   = accuracy_score(ground_truth, predictions)
    F1    = f1_score(ground_truth, predictions)
    
    print ("AUC: ", auc, "AUPRC: ", auprc, "F1: ", F1)
    
def get_subvector_data(size, embed_name, data):
    if embed_name == "concat":
        vector_size = 200
    elif embed_name == "clinicalBERT":
        vector_size = 768
    elif embed_name == "blueBERT":
        vector_size = 768
    else:
        vector_size = 100

    x_data = {}
    for k, v in data.items():
        number_of_additional_vector = len(v) - size
        vector = []
        for i in v:
            vector.append(i)
            
        if number_of_additional_vector < 0: 
            number_of_additional_vector = np.abs(number_of_additional_vector)

            temp = vector[:size]
            for i in range(0, number_of_additional_vector):
                temp.append(np.zeros(vector_size))
            x_data[k] = np.asarray(temp)
        else:
            x_data[k] = np.asarray(vector[:size])

    return x_data


def proposedmodel(layer_name, number_of_unit, embedding_name, ner_limit, num_filter):
    if embedding_name == "concat":
        input_dimension = 200
    elif embedding_name == "clinicalBERT":
        input_dimension = 768
    elif embedding_name == "blueBERT":
        input_dimension = 768
    else:
        input_dimension = 100

    sequence_input = Input(shape=(24,104))

    input_img = Input(shape=(ner_limit, input_dimension), 
                      name = "cnn_input")

    convs = []
    filter_sizes = [2,3,4]

    text_conv1d = Conv1D(filters=num_filter, 
                         kernel_size=3, 
                         padding = 'valid', 
                         strides = 1, 
                         dilation_rate=1, 
                         activation='relu', 
                         kernel_initializer=tf.keras.initializers.GlorotUniform()
                        )(input_img)
    text_conv1d = Dropout(0.2)(text_conv1d)
    
    text_conv1d = Conv1D(filters=num_filter*2, 
                         kernel_size=3, 
                         padding = 'valid', 
                         strides = 1, 
                         dilation_rate=1, 
                         activation='relu',
                         kernel_initializer=tf.keras.initializers.GlorotUniform()
                        )(text_conv1d)   
    text_conv1d = Dropout(0.2)(text_conv1d)
    
    text_conv1d = Conv1D(filters=num_filter*3, 
                         kernel_size=3, 
                         padding = 'valid', 
                         strides = 1, 
                         dilation_rate=1, 
                         activation='relu',
                         kernel_initializer=tf.keras.initializers.GlorotUniform()
                        )(text_conv1d)   
    text_conv1d = Dropout(0.2)(text_conv1d)
    
    text_conv1d = Conv1D(filters=num_filter*4, 
                         kernel_size=3, 
                         padding = 'valid', 
                         strides = 1, 
                         dilation_rate=1, 
                         activation='relu',
                         kernel_initializer=tf.keras.initializers.GlorotUniform()
                        )(text_conv1d)   
    text_conv1d = Dropout(0.2)(text_conv1d)
    
    text_conv1d = Conv1D(filters=num_filter*5, 
                         kernel_size=3, 
                         padding = 'valid', 
                         strides = 1, 
                         dilation_rate=1, 
                         activation='relu',
                         kernel_initializer=tf.keras.initializers.GlorotUniform()
                        )(text_conv1d)   
    text_conv1d = Dropout(0.2)(text_conv1d) 

    
    text_embeddings = GlobalMaxPooling1D()(text_conv1d)
    
    x = GRU(number_of_unit)(sequence_input)
    
    concatenated = Concatenate()([x, text_embeddings])

    concatenated = Dense(512, activation='relu')(concatenated)
    concatenated = Dropout(0.2)(concatenated)
    
    logits_regularizer = tf.keras.regularizers.L2(l2=0.01)
    preds = Dense(1, 
                  activation='sigmoid',
                  use_bias=False,
                  kernel_initializer=tf.keras.initializers.GlorotUniform(), 
                  kernel_regularizer=logits_regularizer
                 )(concatenated)
    
    opt = Adam(lr=1e-3, decay = 0.01)

    model = Model(inputs=[sequence_input, input_img], 
                  outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['acc'])
    
    return model


In [10]:
embedding_types = ['word2vec', 'fasttext', 'concat']#['word2vec', 'fasttext', 'concat', 'clinicalBERT', 'blueBERT']
embedding_dict  = [ner_word2vec, ner_fasttext, ner_concat]#[ner_word2vec, ner_fasttext, ner_concat, ner_clinicalBERT, ner_blueBERT]

target_problems = ['mort_hosp', 'mort_icu', 'los_3', 'los_7']

num_epoch        = 100
model_patience   = 5
monitor_criteria = 'val_loss'
batch_size       = 64

filter_number    = 32
ner_representation_limit = 64
activation_func  = "relu"

sequence_model   = "GRU"
sequence_hidden_unit = 256

maxiter = 11
for embed_dict, embed_name in zip(embedding_dict, embedding_types):  
    
    temp_train_ner = dict((k, embed_dict[k]) for k in train_ids)
    temp_dev_ner   = dict((k, embed_dict[k]) for k in dev_ids)
    temp_test_ner  = dict((k, embed_dict[k]) for k in test_ids)

    x_train_dict = {}
    x_dev_dict   = {}
    x_test_dict  = {}

    x_train_dict = get_subvector_data(ner_representation_limit, embed_name, temp_train_ner)
    x_dev_dict   = get_subvector_data(ner_representation_limit, embed_name, temp_dev_ner)
    x_test_dict  = get_subvector_data(ner_representation_limit, embed_name, temp_test_ner)

    x_train_dict_sorted = collections.OrderedDict(sorted(x_train_dict.items()))
    
    x_dev_dict_sorted   = collections.OrderedDict(sorted(x_dev_dict.items()))
    x_test_dict_sorted  = collections.OrderedDict(sorted(x_test_dict.items()))

    x_train_ner = np.asarray(list(x_train_dict_sorted.values()))
    x_dev_ner   = np.asarray(list(x_dev_dict_sorted.values()))
    x_test_ner  = np.asarray(list(x_test_dict_sorted.values()))
        
    for iteration in tqdm(range(1,maxiter)):
        for each_problem in target_problems:  
            
            print ("Embedding: ", embed_name)
            print ("Iteration number: ", iteration)
            print ("Problem type: ", each_problem)
            print ("__________________")
            
            
            early_stopping_monitor = EarlyStopping(monitor=monitor_criteria, 
                                                   patience=model_patience)
            
            best_model_name = "results/Best Models/" + str(ner_representation_limit)+"-basiccnn1d-"+str(embed_name)+"-"+str(each_problem)+"-"+"best_model.hdf5"
            
            checkpoint = ModelCheckpoint(best_model_name, 
                                         monitor=monitor_criteria, 
                                         verbose=0,
                                         save_best_only=True, 
                                         mode='min')
            
            reduce_lr = ReduceLROnPlateau(monitor=monitor_criteria, 
                                          factor=0.2,
                                          patience=2, 
                                          min_lr=0.00001, 
                                          epsilon=1e-4, 
                                          mode='min')
            

            callbacks = [early_stopping_monitor, checkpoint, reduce_lr]
            
            model = proposedmodel(sequence_model, 
                                  sequence_hidden_unit, 
                                  embed_name, 
                                  ner_representation_limit,
                                  filter_number)
            
            model.fit([x_train, x_train_ner], 
                      y_train[each_problem], 
                      epochs=num_epoch, 
                      verbose=0, 
                      validation_data=([x_dev, x_dev_ner], y_dev[each_problem]), 
                      callbacks=callbacks, 
                      batch_size=batch_size)
            
            
            probs, predictions = make_prediction_cnn(model, [x_test, x_test_ner])
            print_scores_cnn(predictions, 
                             probs, 
                             y_test[each_problem], 
                             embed_name, 
                             each_problem, 
                             iteration, 
                             sequence_hidden_unit)
            
            model.load_weights(best_model_name)
                      
            probs, predictions = make_prediction_cnn(model, [x_test, x_test_ner])
            save_scores_cnn(predictions, 
                            probs, 
                            y_test[each_problem], 
                            embed_name, 
                            each_problem, 
                            iteration,
                            sequence_hidden_unit, 
                            sequence_model 
                            )
            del model
            clear_session()
            gc.collect()
            
    

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Embedding:  word2vec
Iteration number:  1
Problem type:  mort_hosp
__________________
AUC:  0.8814710476703476 AUPRC:  0.5739049882319722 F1:  0.46731571627260077
Embedding:  word2vec
Iteration number:  1
Problem type:  mort_icu
__________________
AUC:  0.8882083904461392 AUPRC:  0.515831045422583 F1:  0.4621676891615542
Embedding:  word2vec
Iteration number:  1
Problem type:  los_3
__________________
AUC:  0.7069631961356277 AUPRC:  0.6450519319625181 F1:  0.5662828461765574
Embedding:  word2vec
Iteration number:  1
Problem type:  los_7
__________________
AUC:  0.7333899348565417 AUPRC:  0.23693684792541678 F1:  0.04289544235924933


 10%|███████▋                                                                     | 1/10 [1:06:25<9:57:49, 3985.55s/it]

Embedding:  word2vec
Iteration number:  2
Problem type:  mort_hosp
__________________
AUC:  0.880666846215998 AUPRC:  0.578469975736881 F1:  0.4790257104194858
Embedding:  word2vec
Iteration number:  2
Problem type:  mort_icu
__________________
AUC:  0.8874463752916969 AUPRC:  0.5198690362455033 F1:  0.47887323943661975
Embedding:  word2vec
Iteration number:  2
Problem type:  los_3
__________________
AUC:  0.7035289326143388 AUPRC:  0.639218362395086 F1:  0.558649289099526
Embedding:  word2vec
Iteration number:  2
Problem type:  los_7
__________________
AUC:  0.7448974174951792 AUPRC:  0.22921460920275546 F1:  0.05305039787798409


 20%|███████████████▍                                                             | 2/10 [2:05:37<8:17:22, 3730.33s/it]

Embedding:  word2vec
Iteration number:  3
Problem type:  mort_hosp
__________________
AUC:  0.8812512792889847 AUPRC:  0.5734118760876075 F1:  0.4811827956989248
Embedding:  word2vec
Iteration number:  3
Problem type:  mort_icu
__________________
AUC:  0.8822594592605539 AUPRC:  0.5100598238416777 F1:  0.4665314401622718
Embedding:  word2vec
Iteration number:  3
Problem type:  los_3
__________________
AUC:  0.7057624792597745 AUPRC:  0.6466208894978152 F1:  0.5583756345177664
Embedding:  word2vec
Iteration number:  3
Problem type:  los_7
__________________
AUC:  0.7331855213165748 AUPRC:  0.2267587684997862 F1:  0.0481283422459893


 30%|███████████████████████                                                      | 3/10 [3:05:50<7:08:59, 3677.07s/it]

Embedding:  word2vec
Iteration number:  4
Problem type:  mort_hosp
__________________
AUC:  0.8809841098841906 AUPRC:  0.5772053325048957 F1:  0.49066666666666664
Embedding:  word2vec
Iteration number:  4
Problem type:  mort_icu
__________________
AUC:  0.8854769417587371 AUPRC:  0.5109680644867606 F1:  0.45703125
Embedding:  word2vec
Iteration number:  4
Problem type:  los_3
__________________
AUC:  0.7040440930500174 AUPRC:  0.6425838381348351 F1:  0.5475469412477286
Embedding:  word2vec
Iteration number:  4
Problem type:  los_7
__________________
AUC:  0.7319852381021323 AUPRC:  0.22776222129735008 F1:  0.05804749340369393


 40%|██████████████████████████████▊                                              | 4/10 [3:41:50<5:07:47, 3077.86s/it]

Embedding:  word2vec
Iteration number:  5
Problem type:  mort_hosp
__________________
AUC:  0.8801115001346621 AUPRC:  0.5752620333762176 F1:  0.4712328767123288
Embedding:  word2vec
Iteration number:  5
Problem type:  mort_icu
__________________
AUC:  0.889382844386386 AUPRC:  0.5156510854855064 F1:  0.466403162055336
Embedding:  word2vec
Iteration number:  5
Problem type:  los_3
__________________
AUC:  0.7029813655765782 AUPRC:  0.6399748402918701 F1:  0.554958183990442
Embedding:  word2vec
Iteration number:  5
Problem type:  los_7
__________________
AUC:  0.7304795410657495 AUPRC:  0.2210615929813683 F1:  0.0374331550802139


 50%|██████████████████████████████████████▌                                      | 5/10 [4:05:31<3:26:43, 2480.61s/it]

Embedding:  word2vec
Iteration number:  6
Problem type:  mort_hosp
__________________
AUC:  0.8799644492324266 AUPRC:  0.5700489639817498 F1:  0.4772117962466488
Embedding:  word2vec
Iteration number:  6
Problem type:  mort_icu
__________________
AUC:  0.890691455924246 AUPRC:  0.5246972853653601 F1:  0.46799999999999997
Embedding:  word2vec
Iteration number:  6
Problem type:  los_3
__________________
AUC:  0.7040192131426124 AUPRC:  0.6401833206397936 F1:  0.5583482944344704
Embedding:  word2vec
Iteration number:  6
Problem type:  los_7
__________________
AUC:  0.736796022863982 AUPRC:  0.2277745161308613 F1:  0.03763440860215054


 60%|██████████████████████████████████████████████▏                              | 6/10 [5:45:53<4:05:38, 3684.58s/it]

Embedding:  word2vec
Iteration number:  7
Problem type:  mort_hosp
__________________
AUC:  0.8811494748182063 AUPRC:  0.5795812993400264 F1:  0.4925975773889637
Embedding:  word2vec
Iteration number:  7
Problem type:  mort_icu
__________________
AUC:  0.8862213247717787 AUPRC:  0.5237477329395441 F1:  0.4745098039215687
Embedding:  word2vec
Iteration number:  7
Problem type:  los_3
__________________
AUC:  0.7039663172050205 AUPRC:  0.6433534680001458 F1:  0.554325052379527
Embedding:  word2vec
Iteration number:  7
Problem type:  los_7
__________________
AUC:  0.7351414107360887 AUPRC:  0.22294348373237438 F1:  0.05774278215223097


 70%|█████████████████████████████████████████████████████▉                       | 7/10 [6:07:34<2:25:15, 2905.31s/it]

Embedding:  word2vec
Iteration number:  8
Problem type:  mort_hosp
__________________
AUC:  0.8801077295987072 AUPRC:  0.5740541626841427 F1:  0.4920212765957447
Embedding:  word2vec
Iteration number:  8
Problem type:  mort_icu
__________________
AUC:  0.8879086440423294 AUPRC:  0.5076882691125678 F1:  0.47036328871892924
Embedding:  word2vec
Iteration number:  8
Problem type:  los_3
__________________
AUC:  0.7036933699855488 AUPRC:  0.6437000447304045 F1:  0.5561209218796768
Embedding:  word2vec
Iteration number:  8
Problem type:  los_7
__________________
AUC:  0.7274674575712639 AUPRC:  0.22217135917760122 F1:  0.021798365122615803


 80%|█████████████████████████████████████████████████████████████▌               | 8/10 [6:33:35<1:22:34, 2477.40s/it]

Embedding:  word2vec
Iteration number:  9
Problem type:  mort_hosp
__________________
AUC:  0.8817242122273095 AUPRC:  0.5807161875004496 F1:  0.4922680412371134
Embedding:  word2vec
Iteration number:  9
Problem type:  mort_icu
__________________
AUC:  0.8861507962061764 AUPRC:  0.5142343388948157 F1:  0.4466800804828973
Embedding:  word2vec
Iteration number:  9
Problem type:  los_3
__________________
AUC:  0.704966322222817 AUPRC:  0.6441629669972803 F1:  0.5543672014260249
Embedding:  word2vec
Iteration number:  9
Problem type:  los_7
__________________
AUC:  0.7395923173601215 AUPRC:  0.23837772207491442 F1:  0.03814713896457765


 90%|███████████████████████████████████████████████████████████████████████        | 9/10 [6:59:42<36:32, 2192.68s/it]

Embedding:  word2vec
Iteration number:  10
Problem type:  mort_hosp
__________________
AUC:  0.882200915701589 AUPRC:  0.5821164319643676 F1:  0.48648648648648646
Embedding:  word2vec
Iteration number:  10
Problem type:  mort_icu
__________________
AUC:  0.887420310387018 AUPRC:  0.5128183167656285 F1:  0.465979381443299
Embedding:  word2vec
Iteration number:  10
Problem type:  los_3
__________________
AUC:  0.7037415617389674 AUPRC:  0.6391253925756186 F1:  0.5591461606878151
Embedding:  word2vec
Iteration number:  10
Problem type:  los_7
__________________
AUC:  0.7367146711010273 AUPRC:  0.2263525131126095 F1:  0.03763440860215054


100%|██████████████████████████████████████████████████████████████████████████████| 10/10 [7:33:30<00:00, 2721.01s/it]


KeyError: 6541