In [1]:
from itertools import islice

import math
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import csv 
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

from tensorflow.keras.layers import Input, Embedding, Bidirectional, Dense, LSTM, TimeDistributed, Lambda, SpatialDropout1D
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D

from tensorflow.keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop

In [2]:
# Load the three pre-tokenised splits (train, test, validation) in one pass.
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/thesis-clean-data/train.csv",
        "../input/thesis-clean-data/test.csv",
        "../input/thesis-clean-data/val.csv",
    )
)

In [3]:
# Preview the first three rows; the list-valued columns are shown as strings
# (they are parsed into real lists in the next cell).
train_data.head(3)

Unnamed: 0,Full Text,UMLS Code,Entity Codes
0,"['<START>', 'DCTN4', 'as', 'a', 'modifier', 'o...","['<START>', 'U-UMLS:C4308010', 'O', 'O', 'O', ...","['<START>', 'U-T103', 'O', 'O', 'O', 'O', 'B-T..."
1,"['<START>', 'Prevascularized', 'silicon', 'mem...","['<START>', 'O', 'U-UMLS:C0037114', 'O', 'O', ...","['<START>', 'O', 'U-T103', 'O', 'O', 'O', 'O',..."
2,"['<START>', 'Seated', 'maximum', 'flexion', ':...","['<START>', 'U-UMLS:C0277814', 'O', 'U-UMLS:C0...","['<START>', 'U-T033', 'O', 'U-T038', 'O', 'O',..."


In [4]:
from ast import literal_eval

# Each cell of 'Full Text' / 'Entity Codes' holds the string form of a Python
# list. literal_eval parses literals only, so (unlike eval) a malformed or
# malicious CSV cell cannot execute arbitrary code.
val_sequences = [literal_eval(seq) for seq in val_data['Full Text']]
val_label_code = [literal_eval(label) for label in val_data['Entity Codes']]
test_sequences = [literal_eval(seq) for seq in test_data['Full Text']]
test_label_code = [literal_eval(label) for label in test_data['Entity Codes']]
train_sequences = [literal_eval(seq) for seq in train_data['Full Text']]
train_label_code = [literal_eval(label) for label in train_data['Entity Codes']]

In [5]:
def _strip_boundary_tokens(sequences, boundary_tokens=('<START>', '<END>')):
    """Remove every boundary marker from each list in `sequences`, in place."""
    for seq in sequences:
        # Slice assignment keeps the same list object, so other references
        # to it observe the cleaned content as well.
        seq[:] = [item for item in seq if item not in boundary_tokens]


def _binarize_labels(label_sequences):
    """Map each tag to "Not-Entity" when it equals "O", otherwise to "Entity"."""
    return [
        ["Not-Entity" if tag == "O" else "Entity" for tag in tags]
        for tags in label_sequences
    ]


def clean_data():
    """Strip boundary markers from all splits and derive binary entity labels.

    Reads the module-level lists `val_sequences`, `val_label_code`,
    `test_sequences`, `test_label_code`, `train_sequences` and
    `train_label_code`, and removes every '<START>' / '<END>' item from them
    in place.

    Returns:
        A 9-tuple ordered as (val, test, train); for each split it holds the
        token lists, the label-code lists and a new list of
        "Entity" / "Not-Entity" labels derived from the label codes.
    """
    for split in (val_sequences, val_label_code, test_sequences,
                  test_label_code, train_sequences, train_label_code):
        _strip_boundary_tokens(split)

    return (val_sequences,
            val_label_code,
            _binarize_labels(val_label_code),
            test_sequences,
            test_label_code,
            _binarize_labels(test_label_code),
            train_sequences,
            train_label_code,
            _binarize_labels(train_label_code))

# Run the cleaning step and bind its nine results (validation, test, train;
# each as sentences, label codes, binary labels) to the names used below.
(val_sentences,
 val_labels,
 val_label_binary,
 test_sentences,
 test_labels,
 test_label_binary,
 train_sentences,
 train_labels,
 train_label_binary) = clean_data()

In [6]:
# Sanity check: sentences, label codes and binary labels must have equal counts.
len(train_sentences), len(train_labels), len(train_label_binary)

(2635, 2635, 2635)

In [7]:
def unique_words():
    """Count the distinct lower-cased tokens across all three splits.

    Reads the module-level `train_sentences`, `test_sentences` and
    `val_sentences`, each an iterable of token lists.

    Returns:
        int: number of different tokens after lower-casing.
    """
    # Only the number of distinct tokens is returned, so a set is enough;
    # per-token counts are not needed.
    vocabulary = set()
    for split in (train_sentences, test_sentences, val_sentences):
        for seq in split:
            vocabulary.update(word.lower() for word in seq)

    return len(vocabulary)
            
# Vocabulary size; used below to size the tokenizer and the embedding layer.
num_tokens = unique_words()
num_tokens

42131

In [8]:
def encode_pad_data():
    """Turn the train/validation tokens and binary labels into padded id arrays.

    Reads the module-level `num_tokens`, `train_sentences`, `val_sentences`,
    `test_sentences`, `train_label_binary` and `val_label_binary`.

    Returns:
        (train sequences, train labels, validation sequences, validation
        labels, text tokenizer, label tokenizer, maxlen), where the four
        arrays are int32 and post-padded to `maxlen`, the longest encoded
        train or validation sequence.
    """
    text_tokenizer = text.Tokenizer(num_tokens+1, lower=True)
    label_tokenizer = text.Tokenizer(3)

    label_tokenizer.fit_on_texts(train_label_binary)

    # The text tokenizer is also fitted on the validation and test
    # sentences; otherwise tokens that never occur in the training split
    # would not receive an id.
    for corpus in (train_sentences, val_sentences, test_sentences):
        text_tokenizer.fit_on_texts(corpus)

    encoded_train_sequences = text_tokenizer.texts_to_sequences(train_sentences)
    encoded_train_labels = label_tokenizer.texts_to_sequences(train_label_binary)
    encoded_val_sequences = text_tokenizer.texts_to_sequences(val_sentences)
    encoded_val_labels = label_tokenizer.texts_to_sequences(val_label_binary)

    # Longest encoded sentence over train and validation (test is not considered).
    maxlen = max(
        max(len(seq) for seq in texts)
        for texts in (encoded_train_sequences, encoded_val_sequences)
    )

    def pad(encoded):
        """Post-pad the id lists to `maxlen` as an int32 array."""
        return sequence.pad_sequences(encoded, dtype='int32', maxlen=maxlen, padding='post')

    train_sentences_ = pad(encoded_train_sequences)
    train_label_binary_ = pad(encoded_train_labels)
    val_sentences_ = pad(encoded_val_sequences)
    val_label_binary_ = pad(encoded_val_labels)

    return (train_sentences_,
            train_label_binary_,
            val_sentences_,
            val_label_binary_,
            text_tokenizer,
            label_tokenizer,
            maxlen)


# NOTE: this rebinds train_sentences, train_label_binary, val_sentences and
# val_label_binary from lists of strings to padded integer arrays. The cell is
# therefore not idempotent: re-run the clean_data() cell before re-running
# this one (or unique_words(), which calls .lower() on every token).
(train_sentences, 
 train_label_binary, 
 val_sentences, 
 val_label_binary,
 text_tokenizer,
 label_tokenizer,
 maxlen) = encode_pad_data()

In [9]:
# Reverse lookups (integer id -> label / token string) used to decode predictions.
inv_label_map = dict(zip(label_tokenizer.word_index.values(),
                         label_tokenizer.word_index.keys()))
inv_text_map = dict(zip(text_tokenizer.word_index.values(),
                        text_tokenizer.word_index.keys()))
len(inv_text_map), len(inv_label_map)

(42131, 2)

## LSTM

In [10]:
import tensorflow as tf
from keras import backend as K
from sklearn.metrics import fbeta_score


def mask(m, q):
    """Return a boolean tensor that is True where `m` differs from `q`.

    `m` is compared element-wise with `q` and the comparison is reduced with
    "any" over the last axis, so an entry is False only when the whole
    last-axis slice equals `q`.
    """
    # The original author notes that m is assumed to be 2D.
    differs = tf.not_equal(m, q)
    return tf.math.reduce_any(differs, axis=-1)


def recall(y_true, y_pred):
    """Recall over the positions whose `y_true` slice is not all zeros.

    Positions where the last-axis slice of `y_true` equals the all-zero
    padding vector are dropped before counting. The masked tensors used to be
    computed and then discarded; they are now the ones being counted.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor.

    Returns:
        true positives / (possible positives + epsilon), as a scalar tensor.
    """
    # NOTE(review): the element-wise product below, and the pad width of 3,
    # presume y_true is one-hot with 3 classes on the last axis — confirm
    # against the label format passed in by the caller.
    pad = tf.constant([0.0] * 3, dtype=tf.float32)
    keep = mask(y_true, pad)
    masked_y_true = tf.boolean_mask(y_true, keep)
    masked_y_pred = tf.boolean_mask(y_pred, keep)
    true_positives = K.sum(K.round(K.clip(masked_y_true * masked_y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(masked_y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def precision(y_true, y_pred):
    """Precision over the positions whose `y_true` slice is not all zeros.

    Positions where the last-axis slice of `y_true` equals the all-zero
    padding vector are dropped before counting. The mask used to be computed
    but ignored, so confident predictions on padded positions were counted
    as predicted positives in the denominator.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor.

    Returns:
        true positives / (predicted positives + epsilon), as a scalar tensor.
    """
    # NOTE(review): the element-wise product below, and the pad width of 3,
    # presume y_true is one-hot with 3 classes on the last axis — confirm
    # against the label format passed in by the caller.
    pad = tf.constant([0.0] * 3, dtype=tf.float32)
    keep = mask(y_true, pad)
    masked_y_true = tf.boolean_mask(y_true, keep)
    masked_y_pred = tf.boolean_mask(y_pred, keep)
    true_positives = K.sum(K.round(K.clip(masked_y_true * masked_y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(masked_y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())


def f1(y_true, y_pred):
    """Harmonic mean of `precision` and `recall` for the same tensors.

    Epsilon is added to the denominator so the result is defined when both
    precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    ratio = (p*r)/(p+r+K.epsilon())
    return 2*ratio

In [16]:
# Build and compile the tagger: embedding -> BiLSTM -> LSTM -> per-timestep
# dense -> softmax over the label ids.
# NOTE(review): this cell shows execution count 16 while the cells after it
# show 12 and 14, and the summary below names the model "model_1" — it looks
# like it was re-run out of order; verify with Restart & Run All.
output_dim = 50  # size of each token embedding vector
tf.random.set_seed(42)
opt = Adam(0.005)

# One integer token id per position, padded to maxlen.
sequence_input = Input(shape=(maxlen,), dtype=tf.int32, name='sequence_input')
# input_dim is num_tokens+1 because id 0 is left for padding; mask_zero=True
# asks downstream layers to skip those padded positions.
outputs = Embedding(input_dim=num_tokens+1, output_dim=output_dim, trainable=True, mask_zero=True)(sequence_input)
# Forward and backward LSTM outputs are concatenated (2 x 64 = 128 features).
outputs = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(outputs)
outputs = LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(outputs)
outputs = (TimeDistributed(Dense(64, activation="relu")))(outputs)

# One softmax unit per label id, plus one for id 0 (padding).
outputs = Dense(len(inv_label_map)+1, activation="softmax")(outputs)

lstm_model = Model(inputs=sequence_input, outputs=outputs)
# The sparse loss takes integer label ids directly.
# NOTE(review): f1/precision/recall multiply y_true by y_pred element-wise,
# which presumes one-hot labels — confirm these metrics are meaningful with
# the integer labels used here.
lstm_model.compile(loss = 'SparseCategoricalCrossentropy', optimizer=opt, metrics=[f1, precision, recall])
lstm_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequence_input (InputLayer)  [(None, 931)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 931, 50)           2106600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 931, 128)          58880     
_________________________________________________________________
lstm_3 (LSTM)                (None, 931, 64)           49408     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 931, 64)           4160      
_________________________________________________________________
dense_3 (Dense)              (None, 931, 3)            195       
Total params: 2,219,243
Trainable params: 2,219,243
Non-trainable params: 0
_________________________________________________

In [12]:
#lstm_model.fit(train_sentences, train_label_binary, epochs=10, batch_size=256, validation_data=(val_sentences, val_label_binary))

In [14]:
import warnings

# Precision, recall and F-score warn ("... is ill-defined ...") when not all
# true labels are represented in the predictions. Silence only those messages
# rather than every warning, so unrelated problems (for example shape or
# deprecation warnings) stay visible.
warnings.filterwarnings('ignore', message=r'.*is ill-defined')

def _drop_excluded(y_true, y_pred, excluded_tags):
    """Return (true, predicted) lists without positions whose true tag is excluded."""
    kept_true, kept_pred = [], []
    for true_tag, pred_tag in zip(y_true, y_pred):
        if true_tag not in excluded_tags:
            kept_true.append(true_tag)
            kept_pred.append(pred_tag)
    return kept_true, kept_pred


def exclude_from_f1(y_true, y_pred, excluded_tags=(0,)):
    """Weighted F1 over the positions whose true tag is not in `excluded_tags`.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with `y_true`.
        excluded_tags: true-tag values to leave out of the score
            (default: tag 0).

    Returns:
        The value of `f1_score(..., average='weighted')` on the kept pairs.
    """
    kept_true, kept_pred = _drop_excluded(y_true, y_pred, excluded_tags)
    return f1_score(kept_true, kept_pred, average='weighted')


def exclude_from_precision(y_true, y_pred, excluded_tags=(0,)):
    """Weighted precision over the positions whose true tag is not in `excluded_tags`.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with `y_true`.
        excluded_tags: true-tag values to leave out of the score
            (default: tag 0).

    Returns:
        The value of `precision_score(..., average='weighted')` on the kept pairs.
    """
    kept_true, kept_pred = _drop_excluded(y_true, y_pred, excluded_tags)
    return precision_score(kept_true, kept_pred, average='weighted')


def exclude_from_recall(y_true, y_pred, excluded_tags=(0,)):
    """Weighted recall over the positions whose true tag is not in `excluded_tags`.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with `y_true`.
        excluded_tags: true-tag values to leave out of the score
            (default: tag 0).

    Returns:
        The value of `recall_score(..., average='weighted')` on the kept pairs.
    """
    kept_true, kept_pred = _drop_excluded(y_true, y_pred, excluded_tags)
    return recall_score(kept_true, kept_pred, average='weighted')

In [17]:
# Per-epoch metric histories. Only the val_*_epochs lists are filled by the
# active code below; the others are fed only by commented-out code.
train_f1_epochs, train_precision_epochs, train_recall_epochs = [], [], []
val_f1_epochs, val_precision_epochs, val_recall_epochs = [], [], []
train_f1_no_other_epochs, train_precision_no_other_epochs, train_recall_no_other_epochs = [], [], []
val_f1_no_other_epochs, val_precision_no_other_epochs, val_recall_no_other_epochs = [], [], []

# Train for 10 epochs, one train_on_batch call per training sentence, then
# score the first 100 validation sentences after every epoch.
for epoch in range(1, 11):
    print('epoch ', epoch)
    # NOTE(review): x and y are single 1-D rows of the padded arrays, with no
    # explicit batch axis. Presumably Keras then treats every token of the
    # sentence as its own length-1 sequence, so the LSTMs never see sentence
    # context — confirm, and consider the commented-out lstm_model.fit(...) call.
    for x, y in zip(train_sentences, train_label_binary):
        #weights = get_weights(y)
        lstm_model.train_on_batch(x, y)#, class_weight=weights)
    
    #train_f1, train_precision, train_recall = [], [], []
    #train_f1_no_other, train_precision_no_other, train_recall_no_other = [], [], []
    #for x, y in zip(train_umls_text, train_labels):
    #    y_pred = np.argmax(lstm_model.predict(x), axis=-1)
    #    train_f1.append(exclude_from_f1(y, y_pred, [0]))
    #    train_precision.append(exclude_from_precision(y, y_pred, [0]))
    #    train_recall.append(exclude_from_recall(y, y_pred, [0]))
    #    train_f1_no_other.append(exclude_from_f1(y, y_pred, [0, 1]))
    #    train_precision_no_other.append(exclude_from_precision(y, y_pred, [0, 1]))
    #    train_recall_no_other.append(exclude_from_recall(y, y_pred, [0, 1]))
        
    #train_f1_epochs.append(np.mean(train_f1))
    #train_precision_epochs.append(np.mean(train_precision))
    #train_recall_epochs.append(np.mean(train_recall))
    #train_f1_no_other_epochs.append(np.mean(train_f1_no_other))
    #train_precision_no_other_epochs.append(np.mean(train_precision_no_other))
    #train_recall_no_other_epochs.append(np.mean(train_recall_no_other))
    
    val_f1_no_other, val_precision_no_other, val_recall_no_other = [], [], []
    val_f1, val_precision, val_recall = [], [], []
    # Only the first 100 validation sentences are scored.
    for x, y in zip(val_sentences[:100], val_label_binary[:100]):
        # Predicted label id per position = arg-max over the softmax outputs.
        y_pred = np.argmax(lstm_model.predict(x), axis=-1)
        # Positions whose true label id is 0 (padding) are left out of the scores.
        val_f1.append(exclude_from_f1(y, y_pred, [0]))
        val_precision.append(exclude_from_precision(y, y_pred, [0]))
        val_recall.append(exclude_from_recall(y, y_pred, [0]))
        #val_f1_no_other.append(exclude_from_f1(y, y_pred, [0, 1]))
        #val_precision_no_other.append(exclude_from_precision(y, y_pred, [0, 1]))
        #val_recall_no_other.append(exclude_from_recall(y, y_pred, [0, 1]))
    
    # Macro-average over sentences: each sentence counts equally, whatever its length.
    val_f1_epochs.append(np.mean(val_f1))
    val_precision_epochs.append(np.mean(val_precision))
    val_recall_epochs.append(np.mean(val_recall))
    #val_f1_no_other_epochs.append(np.mean(val_f1_no_other))
    #val_precision_no_other_epochs.append(np.mean(val_precision_no_other))
    #val_recall_no_other_epochs.append(np.mean(val_recall_no_other))
    
    # Printed order: F1, precision, recall.
    print(np.mean(val_f1), np.mean(val_precision), np.mean(val_recall))
    #print(np.mean(val_f1_no_other), np.mean(val_precision_no_other), np.mean(val_recall_no_other))

epoch  1
0.8794472159250677 0.8860727752408853 0.8808353144878142
epoch  2
0.8833330980511453 0.8887630973053312 0.8827398520465066
epoch  3
0.8787083118824448 0.8839655820737804 0.8784250197844367
epoch  4
0.8473173852847014 0.8534477342432959 0.8515471957043772
epoch  5
0.845066885083785 0.8523725498221087 0.8498113175582425
epoch  6
0.8460912443341768 0.8534081124893018 0.8507334214119124
epoch  7
0.8492333735687141 0.8556932364622657 0.8530068968416187
epoch  8
0.8512451461165844 0.8578369468560006 0.8542829418107574
epoch  9
0.8553233282586942 0.8620052029987931 0.85707534396509
epoch  10
0.8642284714822611 0.8724355904315639 0.8663033572071578


### Test Data

In [18]:
# Encode the test split with the already-fitted tokenizers and post-pad it to
# the same length as the train/validation arrays.
# NOTE(review): maxlen was derived from train and validation only; presumably
# pad_sequences truncates any longer test sentence — confirm none exceed it.
test_sentences_ = sequence.pad_sequences(
    text_tokenizer.texts_to_sequences(test_sentences),
    dtype='int32', maxlen=maxlen, padding='post')
test_label_binary_ = sequence.pad_sequences(
    label_tokenizer.texts_to_sequences(test_label_binary),
    maxlen=maxlen, dtype='int32', padding='post')

In [19]:
# Score every test sentence separately and report the mean of the
# per-sentence weighted F1 (positions whose true label id is 0 are excluded).
f1_scores = []
y_pred = []  # all per-position predictions, concatenated across sentences

# NOTE(review): as in the training loop, x is a single 1-D row without a batch
# axis — confirm how predict() interprets it.
for x, y in zip(test_sentences_, test_label_binary_):
    ypred = lstm_model.predict(x)
    ypred = np.argmax(ypred, axis=-1)  # label id with the highest softmax score
    y_pred.extend(ypred)
    f1_scores.append(exclude_from_f1(y, ypred, [0]))

print('f1 score on test data ', np.mean(f1_scores))

f1 score on test data  0.8637753918193617


In [20]:
# Print token / gold label / predicted label for test sentence 300.
# This rebinds y_pred, replacing the list accumulated in the previous cell.
y_pred = np.argmax(lstm_model.predict(test_sentences_[300]), axis=-1)
print("{0:35} {1:40} {2:40}".format('Extracted Entity', 'Actual Label', 'Predicted Label'))
print("{0:35} {1:40} {2:40}".format('________________', '____________', '_______________'))
for x, y, yhat in zip(test_sentences_[300], test_label_binary_[300], y_pred):
    # Token id 0 is padding, so those positions are not printed.
    if x != 0:
        #print('{0:>{numLength}}-{1:>{numLength}}: {2}'.format(lower[i], upper[i], '*' * num[i], numLength=digits))
        # yhat is indexed with [0], so each prediction row holds one element —
        # NOTE(review): presumably because predict() received a 1-D input; confirm.
        print("{0:35} {1:40} {2:40}".format(inv_text_map[x], inv_label_map[y], inv_label_map[yhat[0]]))

Extracted Entity                    Actual Label                             Predicted Label                         
________________                    ____________                             _______________                         
inducible                           not-entity                               not-entity                              
expression                          entity                                   entity                                  
of                                  not-entity                               not-entity                              
both                                not-entity                               not-entity                              
ermb                                entity                                   entity                                  
and                                 not-entity                               not-entity                              
ermt                                entity              