In [2]:
import numpy as np
import json

import pandas as pd

import tensorflow as tf
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

from tensorflow.keras.layers import Input, Embedding, Bidirectional, Dense, LSTM, TimeDistributed, Lambda, SpatialDropout1D, Layer
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D

from tensorflow.keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop

from sklearn.metrics import f1_score, precision_score, recall_score

import warnings

In [3]:
def reformat_data(data_file):
    with open(data_file, 'r') as file:
        article_sentences, article_labels = [], []
        sentence_tokens, sentence_labels = [], []
        
        for line in file.readlines():
            if "-DOCSTART-" in line:
                if sentence_labels != []:
                    article_sentences.append(sentence_tokens)
                    article_labels.append(sentence_labels)
                    sentence_tokens, sentence_labels = [], []
            else:
                try:
                    token = line.split("\t")[0]
                    label = line.split("\t")[3][:-1]
                    sentence_tokens.append(token)
                    sentence_labels.append(label)
                except:
                    if sentence_labels != []:
                        article_sentences.append(sentence_tokens)
                        article_labels.append(sentence_labels)
                        sentence_tokens, sentence_labels = [], []
                
    return article_sentences, article_labels
    
train_sentences, train_detect_labels = reformat_data("../input/medlinker-data/mm_ner_ent.train.conll")
test_sentences, test_detect_labels = reformat_data("../input/medlinker-data/mm_ner_ent.test.conll")
dev_sentences, dev_detect_labels = reformat_data("../input/medlinker-data/mm_ner_ent.dev.conll")
_, train_recog_labels = reformat_data("../input/medlinker-data/mm_ner_sts.train.conll")
_, test_recog_labels = reformat_data("../input/medlinker-data/mm_ner_sts.test.conll")
_, dev_recog_labels = reformat_data("../input/medlinker-data/mm_ner_sts.dev.conll")

In [4]:
def unique_words():
    dict_ = {}
    lengths = []
    sent = []
    i = 0
    for txt in [train_sentences, test_sentences, dev_sentences]:
        for sentence in txt:
            lengths.append(len(sentence))
            sent.append(sentence)
            for word in np.unique(sentence):
                if word.lower() not in dict_.keys():
                    i+=1
                    dict_[word.lower()] = i
                    
    return dict_, np.max(lengths), sent
            
tokens_dict, maxlen, sent = unique_words()
maxlen

178

In [5]:
len(tokens_dict)

54563

In [6]:
label_dict = {}
i = 0
for sent_labels in train_detect_labels:
    for label in sent_labels:
        if label not in label_dict.keys():
            i+=1
            label_dict[label] = i 

In [7]:
label_dict['[PAD]'] = 0
label_dict

{'B-Entity': 1, 'O': 2, 'I-Entity': 3, '[PAD]': 0}

In [8]:
warnings.filterwarnings('ignore')
"""This ignored warning because precision and recall give warnings
that not all the true labels are represented in the predictions"""

def exclude_from_f1(y_true, y_pred, excluded_tags=[0]):
    ytrue, yhat = [], []
    for y_t, y_p in zip(y_true, y_pred):
        if y_t not in excluded_tags:
            ytrue.append(y_t)
            yhat.append(y_p)
    f1 = f1_score(ytrue, yhat, average='micro')
    return f1

def exclude_from_precision(y_true, y_pred, excluded_tags=[0]):
    ytrue, yhat = [], []
    for y_t, y_p in zip(y_true, y_pred):
        if y_t not in excluded_tags:
            ytrue.append(y_t)
            yhat.append(y_p)
    precision = precision_score(ytrue, yhat, average='micro')
    return precision

def exclude_from_recall(y_true, y_pred, excluded_tags=[0]):
    ytrue, yhat = [], []
    for y_t, y_p in zip(y_true, y_pred):
        if y_t not in excluded_tags:
            ytrue.append(y_t)
            yhat.append(y_p)
    recall = recall_score(ytrue, yhat, average='micro')
    return recall



In [9]:
def mask(m, q):
    # Assumes m is 2D
    mask = tf.math.reduce_any(tf.not_equal(m, q), axis=-1)
    #return tf.boolean_mask(m, mask)
    return mask


def recall(y_true, y_pred):
    pad = tf.constant([0 for i in range(4)], dtype=tf.float32)
    mask_ = mask(y_true, pad)
    masked_y_data = tf.boolean_mask(y_true, mask_)
    masked_y_pred = tf.boolean_mask(y_pred, mask_)
    true_positives = K.sum(K.round(K.clip(masked_y_data * masked_y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(masked_y_data, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    pad = tf.constant([0 for i in range(4)], dtype=tf.float32)
    mask_ = mask(y_true, pad)
    masked_y_data = tf.boolean_mask(y_true, mask_)
    masked_y_pred = tf.boolean_mask(y_pred, mask_)
    true_positives = K.sum(K.round(K.clip(masked_y_data * masked_y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(masked_y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def f1(y_true, y_pred):
    precision_ = precision(y_true, y_pred)
    recall_ = recall(y_true, y_pred)
    return 2*((precision_*recall_)/(precision_+recall_+K.epsilon()))

In [10]:
#!pip install transformers==2.11.0
from transformers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Scibert Tokenizer

In [12]:
!wget "https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar"
!tar -xf scibert_scivocab_uncased.tar
scibert_tokenizer = BertTokenizer.from_pretrained('./scibert_scivocab_uncased/', do_lower_case=True)

--2021-03-26 23:02:51--  https://s3-us-west-2.amazonaws.com/ai2-s2-research/scibert/huggingface_pytorch/scibert_scivocab_uncased.tar
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.234.64
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.234.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 442460160 (422M) [application/x-tar]
Saving to: ‘scibert_scivocab_uncased.tar’


2021-03-26 23:03:01 (47.1 MB/s) - ‘scibert_scivocab_uncased.tar’ saved [442460160/442460160]



In [14]:
scibert_tokenizer

PreTrainedTokenizer(name_or_path='./scibert_scivocab_uncased/', vocab_size=31090, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [29]:
train_seq = sequence.pad_sequences(train_sentences, dtype=object, maxlen=maxlen, padding='post', value='[PAD]')
dev_seq = sequence.pad_sequences(dev_sentences, dtype=object, maxlen=maxlen, padding='post', value='[PAD]') 
test_seq = sequence.pad_sequences(test_sentences, dtype=object, maxlen=maxlen, padding='post', value='[PAD]') 
train_seq_tokenized = [scibert_tokenizer.convert_tokens_to_ids(s) for s in train_seq]
dev_seq_tokenized = [scibert_tokenizer.convert_tokens_to_ids(s) for s in dev_seq]
test_seq_tokenized = [scibert_tokenizer.convert_tokens_to_ids(s) for s in test_seq]

## Lebel tokenizer

In [15]:
train_labels, dev_labels = train_detect_labels, dev_detect_labels

for i, labels in enumerate(train_detect_labels):
    for j, label in enumerate(labels):
        train_labels[i][j] = label_dict[label]
for i, labels in enumerate(dev_detect_labels):
    for j, label in enumerate(labels):
        dev_labels[i][j] = label_dict[label]
        
train_labels_ohe = [to_categorical(i, num_classes=4) for i in train_labels]
dev_labels_ohe = [to_categorical(i, num_classes=4) for i in dev_labels]
        
train_labels = sequence.pad_sequences(train_labels_ohe, maxlen=maxlen, dtype='int32', padding='post')
dev_labels = sequence.pad_sequences(dev_labels_ohe, maxlen=maxlen, dtype='int32', padding='post')
train_labels = np.array(train_labels)
dev_labels = np.array(dev_labels)

## Build mask to ignore padded values

In [16]:
train_mask = [[1]*len(sent)+[0]*(maxlen - len(sent)) for sent in train_sentences]
train_mask = tf.cast(train_mask,tf.int32)
dev_mask = [[1]*len(sent)+[0]*(maxlen - len(sent)) for sent in dev_sentences]
dev_mask = tf.cast(dev_mask,tf.int32)
test_mask = [[1]*len(sent)+[0]*(maxlen - len(sent)) for sent in test_sentences]
test_mask = tf.cast(test_mask,tf.int32)

## Cast sequences into tensors

In [30]:
train_seq = train_seq_tokenized
train_seq = tf.cast(train_seq, tf.int32)
dev_seq = dev_seq_tokenized
dev_seq = tf.cast(dev_seq, tf.int32)
test_seq = test_seq_tokenized
test_seq = tf.cast(test_seq, tf.int32)
train_labels = tf.cast(train_labels, tf.int32)
dev_labels = tf.cast(dev_labels, tf.int32)

(TensorShape([27892, 178]),
 TensorShape([27892, 178]),
 TensorShape([27892, 178, 4]))

In [None]:
print(train_seq.shape)
print(train_mask.shape)
print(train_labels.shape)
print(dev_seq.shape)
print(dev_mask.shape)
print(dev_labels.shape)
print(test_seq.shape)
print(test_mask.shape)
print(test_labels.shape)

## Scibert LSTM

In [44]:
#reference: https://github.com/huggingface/transformers/issues/1350

tf.random.set_seed(42)
opt = Adam(0.0001)

config = BertConfig.from_json_file('./scibert_scivocab_uncased/config.json')
encoder = TFBertForTokenClassification.from_pretrained("./scibert_scivocab_uncased/", from_pt=True, name='scibert', config=config)
encoder.bert._trainable = False

input_ids = Input(shape=(maxlen,), dtype=tf.int32)
#token_type_ids = Input(shape=(maxlen,), dtype=tf.int32)
attention_mask = Input(shape=(maxlen,), dtype=tf.int32)
embedding = encoder(input_ids, attention_mask=attention_mask)[0]

outputs = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(embedding)

output = Dense(len(label_dict), activation='softmax', name='output')(outputs)

bert_lstm_model = Model([input_ids, attention_mask], output)

bert_lstm_model.compile(loss = 'CategoricalCrossentropy', optimizer=opt, metrics=[f1, precision, recall])

bert_lstm_model.summary()

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 178)]        0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 178)]        0                                            
__________________________________________________________________________________________________
scibert (TFBertForTokenClassifi TFTokenClassifierOut 109329410   input_17[0][0]                   
                                                                 input_18[0][0]                   
__________________________________________________________________________________________________
bidirectional_8 (Bidirectional) (None, 178, 128)     34304       scibert[0][0]              

In [45]:
his = bert_lstm_model.fit((train_seq, sentences_mask), train_labels, epochs=2, batch_size=1024)

The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 1/2


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




KeyboardInterrupt: 

In [21]:
inv_label_map = {v: k for k, v in label_dict.items()}
inv_text_map = {v: k for k, v in tokens_dict.items()}

y_pred = np.argmax(bert_lstm_model.predict(train_seq[1]), axis=-1)
print("{0:35} {1:40} {2:40}".format('Extracted Entity', 'Actual Label', 'Predicted Label'))
print("{0:35} {1:40} {2:40}".format('________________', '____________', '_______________'))
for x, y, yhat in zip(train_seq[1], train_labels[1], y_pred):
    if x != 0:
        #print('{0:>{numLength}}-{1:>{numLength}}: {2}'.format(lower[i], upper[i], '*' * num[i], numLength=digits))
        print("{0:35} {1:40} {2:40}".format(inv_text_map[x], inv_label_map[np.argmax(y)], inv_label_map[yhat[0]]))

ValueError: in user code:

    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1478 predict_function  *
        return step_function(self, iterator)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1468 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1461 run_step  **
        outputs = model.predict_step(data)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:1434 predict_step
        return self(x, training=False)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/engine/input_spec.py:207 assert_input_compatibility
        ' input tensors. Inputs received: ' + str(inputs))

    ValueError: Layer model expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'ExpandDims:0' shape=(None, 1) dtype=int32>]


In [None]:
embed_size = 50
tf.random.set_seed(42)
opt = Adam(0.001)

sequence_input = Input(shape=(None,), dtype=tf.int32, name='sequence_input')
outputs = Embedding(input_dim=len(tokens_dict)+1, output_dim=embed_size, trainable=True, mask_zero=True)(sequence_input)
#outputs = Masking(mask_value=special_value, input_shape=(None, 2))(outputs)
outputs = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(outputs)
#outputs = LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(outputs)
outputs = (TimeDistributed(Dense(64, activation="relu")))(outputs)

outputs = Dense(len(label_dict)+1, activation="softmax")(outputs)

lstm_model = Model(inputs=sequence_input, outputs=outputs)
lstm_model.compile(loss = 'CategoricalCrossentropy', optimizer=opt, metrics=[f1, precision, recall])
lstm_model.summary()

In [None]:
his = lstm_model.fit(train_seq, train_labels, epochs=2, batch_size=16, validation_data=(dev_seq, dev_labels))

In [None]:
def get_weights(y_classes):    
    total = len(y_classes)
    class_dict = {}
    for tag in y_classes:
        if tag not in class_dict.keys():
            class_dict[tag] = 1
        else:
            class_dict[tag] += 1

    class_weight = {}

    for key, value in class_dict.items():
        class_weight[key] = (1 / value * total / len(label_dict))
    
    for i in range(4):
        if i not in class_weight.keys():
            class_weight[i] = 0
    class_weight[0] = 0
    return class_weight

In [None]:
label_dict

In [None]:
inv_label_map = {v: k for k, v in label_dict.items()}
inv_text_map = {v: k for k, v in tokens_dict.items()}

In [None]:
y_pred = np.argmax(lstm_model.predict(dev_seq[1]), axis=-1)
print("{0:35} {1:40} {2:40}".format('Extracted Entity', 'Actual Label', 'Predicted Label'))
print("{0:35} {1:40} {2:40}".format('________________', '____________', '_______________'))
for x, y, yhat in zip(dev_seq[1], dev_labels[1], y_pred):
    if x != 0:
        #print('{0:>{numLength}}-{1:>{numLength}}: {2}'.format(lower[i], upper[i], '*' * num[i], numLength=digits))
        print("{0:35} {1:40} {2:40}".format(inv_text_map[x], inv_label_map[np.argmax(y)], inv_label_map[yhat[0]]))