In [1]:
import ast
import csv
import random
from itertools import islice

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

import tensorflow as tf
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

from tensorflow.keras.layers import Input, Embedding, Bidirectional, Dense, LSTM, TimeDistributed, Lambda, SpatialDropout1D
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D

from tensorflow.keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop

In [2]:
# Read the train / test / validation splits from ../input/thesis-clean-data.
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/thesis-clean-data/train.csv",
        "../input/thesis-clean-data/test.csv",
        "../input/thesis-clean-data/val.csv",
    )
)

In [3]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,Full Text,UMLS Code,Entity Codes
0,"['<START>', 'DCTN4', 'as', 'a', 'modifier', 'o...","['<START>', 'U-UMLS:C4308010', 'O', 'O', 'O', ...","['<START>', 'U-T103', 'O', 'O', 'O', 'O', 'B-T..."
1,"['<START>', 'Prevascularized', 'silicon', 'mem...","['<START>', 'O', 'U-UMLS:C0037114', 'O', 'O', ...","['<START>', 'O', 'U-T103', 'O', 'O', 'O', 'O',..."
2,"['<START>', 'Seated', 'maximum', 'flexion', ':...","['<START>', 'U-UMLS:C0277814', 'O', 'U-UMLS:C0...","['<START>', 'U-T033', 'O', 'U-T038', 'O', 'O',..."
3,"['<START>', 'The', 'Relationship', 'Between', ...","['<START>', 'O', 'O', 'O', 'O', 'O', 'B-UMLS:C...","['<START>', 'O', 'O', 'O', 'O', 'O', 'B-T033',..."
4,"['<START>', 'Promoting', 'lifestyle', 'behavio...","['<START>', 'O', 'O', 'O', 'O', 'O', 'B-UMLS:C...","['<START>', 'O', 'O', 'O', 'O', 'O', 'B-T033',..."


In [4]:
# Each 'Full Text' / 'Entity Codes' cell holds the repr of a Python list
# (e.g. "['<START>', 'DCTN4', ...]") and has to be parsed back into a list.
# ast.literal_eval only accepts Python literals, whereas eval() would execute
# any expression that happened to be stored in the CSV.
val_sequences = [ast.literal_eval(seq) for seq in val_data['Full Text']]
val_label_code = [ast.literal_eval(label) for label in val_data['Entity Codes']]
test_sequences = [ast.literal_eval(seq) for seq in test_data['Full Text']]
test_label_code = [ast.literal_eval(label) for label in test_data['Entity Codes']]
train_sequences = [ast.literal_eval(seq) for seq in train_data['Full Text']]
train_label_code = [ast.literal_eval(label) for label in train_data['Entity Codes']]

In [5]:
def clean_data():
    """Strip the '<START>' and '<END>' marker tokens from every split, in place.

    Operates on the module-level lists ``val_sequences``, ``val_label_code``,
    ``test_sequences``, ``test_label_code``, ``train_sequences`` and
    ``train_label_code``. Every inner list is filtered in one pass and updated
    through slice assignment, so the list objects stay the same ones as before
    the call, only without the markers.

    Returns:
        tuple: ``(val_sequences, val_label_code, test_sequences,
        test_label_code, train_sequences, train_label_code)`` -- the same
        (now cleaned) module-level lists.
    """
    markers = ('<START>', '<END>')
    for dataset in (val_sequences, val_label_code, test_sequences,
                    test_label_code, train_sequences, train_label_code):
        for tokens in dataset:
            # Single linear pass; repeated `in` + `remove` would rescan the
            # list once for every marker it contains.
            tokens[:] = [tok for tok in tokens if tok not in markers]

    return (val_sequences,
            val_label_code,
            test_sequences,
            test_label_code,
            train_sequences,
            train_label_code)

# Strip the marker tokens and bind the cleaned splits to shorter names.
# clean_data() returns the module-level lists it cleaned, so e.g.
# val_sentences and val_sequences refer to the same list object.
(val_sentences,
 val_labels,
 test_sentences,
 test_labels,
 train_sentences,
 train_labels) = clean_data()

In [6]:
# Sanity check: the training split should have one label sequence per sentence.
len(train_sentences), len(train_labels)

(2635, 2635)

In [7]:
def unique_words():
    """Collect the lower-cased vocabulary of the train, test and validation sentences.

    Reads the module-level ``train_sentences``, ``test_sentences`` and
    ``val_sentences`` (each an iterable of token sequences), in that order.

    Returns:
        dict: one key per distinct lower-cased token. A token's value starts
        as the number of distinct tokens seen up to and including it, and is
        incremented by one every time the token is seen again.
        NOTE(review): the value is therefore neither a pure index nor a pure
        count; the visible caller only uses ``len()`` of the result.
    """
    vocab = {}
    distinct_seen = 0
    corpora = (train_sentences, test_sentences, val_sentences)
    lowered_tokens = (word.lower()
                      for corpus in corpora
                      for seq in corpus
                      for word in seq)
    for token in lowered_tokens:
        if token in vocab:
            vocab[token] += 1
        else:
            distinct_seen += 1
            vocab[token] = distinct_seen

    return vocab


def unique_ents():
    """Count the distinct lower-cased tags across the train, test and validation labels.

    Reads the module-level ``train_labels``, ``test_labels`` and ``val_labels``
    (each an iterable of tag sequences).

    Returns:
        int: number of distinct tags after lower-casing.
    """
    seen_tags = set()
    for split in (train_labels, test_labels, val_labels):
        for seq in split:
            # np.unique collapses repeats within one sequence before lower-casing.
            seen_tags.update(tag.lower() for tag in np.unique(seq))

    return len(seen_tags)

# Vocabulary and tag-set sizes over all three splits; both are used further
# down to size the tokenizers and the model's embedding / output layers.
num_tokens = len(unique_words())
num_tags = unique_ents()

print("Tags: ", num_tags)
print("Tokens: ", num_tokens)

Tags:  85
Tokens:  42131


In [8]:
def encode_pad_data():
    """Integer-encode and post-pad the training and validation splits.

    Reads the module-level ``train_sentences``, ``train_labels``,
    ``val_sentences``, ``val_labels`` and ``test_sentences`` together with
    ``num_tokens`` and ``num_tags``.

    The label tokenizer is fitted on the training labels only. The text
    tokenizer is fitted on the training, validation and test sentences (in
    that order).

    Returns:
        tuple: ``(train_sentences_, train_labels_, val_sentences_,
        val_labels_, text_tokenizer, label_tokenizer, maxlen)``. The four
        encoded splits are padded to ``maxlen``, the length of the longest
        encoded training or validation sentence.
    """
    text_tokenizer = text.Tokenizer(num_tokens+1, lower=True)
    label_tokenizer = text.Tokenizer(num_tags+1)

    label_tokenizer.fit_on_texts(train_labels)

    # The text tokenizer is also fitted on the validation and test sentences;
    # otherwise tokens that only occur there would not receive an index.
    for split in (train_sentences, val_sentences, test_sentences):
        text_tokenizer.fit_on_texts(split)

    train_ids = text_tokenizer.texts_to_sequences(train_sentences)
    train_tag_ids = label_tokenizer.texts_to_sequences(train_labels)
    val_ids = text_tokenizer.texts_to_sequences(val_sentences)
    val_tag_ids = label_tokenizer.texts_to_sequences(val_labels)

    maxlen = max(max(len(seq) for seq in split) for split in (train_ids, val_ids))

    def pad(encoded):
        # Same padding settings for sentences and labels so positions stay aligned.
        return sequence.pad_sequences(encoded, maxlen=maxlen, dtype='int32', padding='post')

    return (pad(train_ids),
            pad(train_tag_ids),
            pad(val_ids),
            pad(val_tag_ids),
            text_tokenizer,
            label_tokenizer,
            maxlen)


# NOTE: this rebinds train_sentences / train_labels / val_sentences /
# val_labels from the raw token lists to their encoded, padded form.
# unique_words(), unique_ents() and encode_pad_data() read these names as raw
# tokens (e.g. word.lower()), so those cells presumably cannot be re-run
# after this one without starting again from the data-loading cells.
(train_sentences, 
 train_labels, 
 val_sentences, 
 val_labels,
 text_tokenizer,
 label_tokenizer,
 maxlen) = encode_pad_data()

In [9]:
# Reverse lookups (integer id -> tag / token) built from the tokenizers' word_index.
inv_label_map = dict(zip(label_tokenizer.word_index.values(), label_tokenizer.word_index.keys()))
inv_text_map = dict(zip(text_tokenizer.word_index.values(), text_tokenizer.word_index.keys()))
len(inv_text_map), len(inv_label_map)

(42131, 85)

In [14]:
import warnings
warnings.filterwarnings('ignore')
"""This ignored warning because precision and recall give warnings
that not all the true labels are represented in the predictions"""

def _drop_excluded(y_true, y_pred, excluded_tags):
    """Return the (true, predicted) pairs whose true tag is not excluded, as two lists."""
    ytrue, yhat = [], []
    for y_t, y_p in zip(y_true, y_pred):
        if y_t not in excluded_tags:
            ytrue.append(y_t)
            yhat.append(y_p)
    return ytrue, yhat


def exclude_from_f1(y_true, y_pred, excluded_tags=(0,)):
    """Micro-averaged F1 over the positions whose true tag is not excluded.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with ``y_true``.
        excluded_tags: container of true-tag ids to leave out of the score.
            Defaults to ``(0,)`` (0 is presumably the padding id).

    Returns:
        The value of ``f1_score(..., average='micro')`` on the kept pairs.
    """
    ytrue, yhat = _drop_excluded(y_true, y_pred, excluded_tags)
    return f1_score(ytrue, yhat, average='micro')


def exclude_from_precision(y_true, y_pred, excluded_tags=(0,)):
    """Micro-averaged precision over the positions whose true tag is not excluded.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with ``y_true``.
        excluded_tags: container of true-tag ids to leave out of the score.
            Defaults to ``(0,)`` (0 is presumably the padding id).

    Returns:
        The value of ``precision_score(..., average='micro')`` on the kept pairs.
    """
    ytrue, yhat = _drop_excluded(y_true, y_pred, excluded_tags)
    return precision_score(ytrue, yhat, average='micro')


def exclude_from_recall(y_true, y_pred, excluded_tags=(0,)):
    """Micro-averaged recall over the positions whose true tag is not excluded.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with ``y_true``.
        excluded_tags: container of true-tag ids to leave out of the score.
            Defaults to ``(0,)`` (0 is presumably the padding id).

    Returns:
        The value of ``recall_score(..., average='micro')`` on the kept pairs.
    """
    ytrue, yhat = _drop_excluded(y_true, y_pred, excluded_tags)
    return recall_score(ytrue, yhat, average='micro')

In [28]:
# Width shared by the embedding, both LSTM layers and the hidden dense layer.
output_dim = 50
tf.random.set_seed(42)
# NOTE(review): Adam is imported from `keras.optimizers` while the layers and
# Model come from `tensorflow.keras`; mixing the two packages may fail
# depending on the installed versions -- confirm.
opt = Adam(0.001)

# Input: one padded sentence of maxlen token ids.
sequence_input = Input(shape=(maxlen,), dtype=tf.int32, name='sequence_input')
# num_tokens+1 rows because token ids start at 1; mask_zero=True marks id 0 as masked.
outputs = Embedding(input_dim=num_tokens+1, output_dim=output_dim, trainable=True, mask_zero=True)(sequence_input)
# Bidirectional LSTM with concatenated directions, followed by a unidirectional LSTM.
outputs = Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(outputs)
outputs = LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(outputs)

outputs = (TimeDistributed(Dense(output_dim, activation="relu")))(outputs)

# One softmax over num_tags+1 classes per position (the +1 leaves room for id 0).
outputs = Dense(num_tags+1, activation="softmax")(outputs)

lstm_model = Model(inputs=sequence_input, outputs=outputs)
# The tracked metric is the same sparse categorical cross-entropy as the loss.
lstm_model.compile(loss = 'SparseCategoricalCrossentropy', optimizer=opt, metrics=[tf.keras.metrics.SparseCategoricalCrossentropy()])
lstm_model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequence_input (InputLayer)  [(None, 931)]             0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 931, 50)           2106600   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 931, 100)          40400     
_________________________________________________________________
lstm_9 (LSTM)                (None, 931, 50)           30200     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 931, 50)           2550      
_________________________________________________________________
dense_9 (Dense)              (None, 931, 86)           4386      
Total params: 2,184,136
Trainable params: 2,184,136
Non-trainable params: 0
_________________________________________________

In [22]:
def get_weights(y_classes):
    """Build a per-class weight dict for one label sequence.

    Every tag id that occurs in ``y_classes`` is weighted by its inverse
    frequency, ``1 / count * len(y_classes) / num_tags``. Every other id in
    ``0..num_tags`` gets weight 0, and id 0 is always forced to 0.

    Reads the module-level ``num_tags``.

    Args:
        y_classes: sequence of integer tag ids, one per position.

    Returns:
        dict: tag id -> weight, with a key for every id in ``0..num_tags``.
    """
    total = len(y_classes)
    counts = {}
    for tag in y_classes:
        counts[tag] = counts.get(tag, 0) + 1

    class_weight = {tag: (1 / count * total / num_tags)
                    for tag, count in counts.items()}

    # Tag ids run from 0 to num_tags inclusive (the model has num_tags + 1
    # output classes), so the fill range is derived from num_tags instead of
    # a hard-coded 85, which stopped one id short.
    for tag_id in range(num_tags + 1):
        class_weight.setdefault(tag_id, 0)
    class_weight[0] = 0
    return class_weight

In [None]:
# Train for 10 epochs with one train_on_batch call per training sentence,
# then print the mean per-sentence micro-F1 over the validation split.
for epoch in range(1, 11):
    print('epoch ', epoch)
    for x, y in zip(train_sentences, train_labels):
        # Class weights are recomputed from this sentence's own tag counts.
        weights = get_weights(y)
        # NOTE(review): x and y are single rows of the padded matrices, i.e.
        # 1-D of length maxlen with no leading batch axis. Keras presumably
        # treats that as maxlen samples of one token each rather than one
        # sequence of maxlen tokens -- confirm this is intended for an LSTM.
        lstm_model.train_on_batch(x, y, class_weight=weights)
    val_f1 = []
    for x, y in zip(val_sentences, val_labels):
        y_pred = np.argmax(lstm_model.predict(x), axis=-1)
        # Positions whose true tag is 0 are left out of the score.
        val_f1.append(exclude_from_f1(y, y_pred, [0]))
    
    print(np.mean(val_f1))

epoch  1


In [17]:
# Encode and pad the test split with the tokenizers and maxlen from encode_pad_data().
# NOTE: this rebinds test_sentences / test_labels in place of the raw token
# lists, so the cell presumably cannot be run twice in a row.
# NOTE(review): maxlen was computed from the train and validation splits only;
# longer test sentences would presumably be cut to maxlen here -- confirm.
test_sentences = text_tokenizer.texts_to_sequences(test_sentences)
test_labels = label_tokenizer.texts_to_sequences(test_labels)
test_sentences = sequence.pad_sequences(test_sentences, dtype='int32', maxlen=maxlen, padding='post') 
test_labels = sequence.pad_sequences(test_labels, maxlen=maxlen, dtype='int32', padding='post')

In [30]:
# Tag one test sentence and print every token with its gold and predicted tag.
sample_index = 400
row_format = "{0:20} {1:15} {2:40}"

y_pred = np.argmax(lstm_model.predict(test_sentences[sample_index]), axis=-1)
print(row_format.format('Extracted Entity', 'Actual Label', 'Predicted Label'))
# Exactly one underline per column, each as wide as its header text.
print(row_format.format('________________', '____________', '_______________'))
for x, y, yhat in zip(test_sentences[sample_index], test_labels[sample_index], y_pred):
    if x != 0:  # skip positions whose token id is 0 (padding)
        # yhat is indexed with [0]: each prediction row holds a single class id.
        print(row_format.format(inv_text_map[x], inv_label_map[y], inv_label_map[yhat[0]]))

Extracted Entity     Actual Label    Predicted Label                         
________________     ____            ____________                            
genome               b-t062          u-t017                                  
-                    i-t062          i-t103                                  
wide                 i-t062          o                                       
detection            l-t062          u-t058                                  
of                   o               o                                       
selective            b-t038          b-t033                                  
signatures           l-t038          u-t201                                  
in                   o               b-t082                                  
chicken              u-t204          u-t204                                  
through              o               o                                       
high                 b-t082          b-t033                     