In [1]:
from itertools import islice

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score

import tensorflow as tf
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K


from tensorflow.keras.layers import Input, Embedding, Bidirectional, Dense, LSTM, TimeDistributed, Lambda,Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D
from tensorflow.keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop


In [2]:
# Input locations. Note the mix of absolute (/kaggle/input) and relative
# (../input) prefixes.
# Semantic-group table, parsed by umls_semtype().
semgroups = "/kaggle/input/thesis/SemGroups_2018.txt"
# NOTE(review): semtypes is not referenced anywhere else in the visible notebook.
semtypes = "/kaggle/input/thesis/SemanticTypes_2018AB.txt"
# Annotated corpus in PubTator format, handed to PubTatorCorpusReader.
medmentions = "/kaggle/input/thesis/corpus_pubtator.txt"
# Concept table, parsed by umls_concepts().
umls_concept = "/kaggle/input/thesis/MRCONSO.RRF"
# Document-id lists that define the train / validation / test split.
train_docs_ids = "../input/doc-ids/corpus_pubtator_pmids_trng.txt"
val_docs_ids = "../input/doc-ids/corpus_pubtator_pmids_dev.txt"
test_docs_ids = "../input/doc-ids/corpus_pubtator_pmids_test.txt"
# CSV of concept embeddings, loaded by concept_dict().
umls_embedding_file = "../input/umls-embeddings/embeddings.csv"

In [3]:
def read_in_chunks(file_object, n=10000):
    """Yield successive lists holding up to ``n`` items from ``file_object``.

    ``file_object`` must be an iterator (an open text file yields lines).
    Iteration stops as soon as no more items can be pulled from it.

    Args:
        file_object: iterator to consume.
        n: maximum number of items per yielded list (default 10000).

    Yields:
        list: the next batch of at most ``n`` items, never empty.
    """
    # iter(callable, sentinel) keeps calling until an empty batch comes back.
    for batch in iter(lambda: list(islice(file_object, n)), []):
        yield batch

        
def umls_concepts():
    """Map concept identifiers to an English name read from the concept table.

    Reads the pipe-delimited file at the module-level path ``umls_concept``.
    A row is kept when its second field is ``'ENG'``; field 0 becomes the key
    and field 14 the value. When several English rows share a key, the last
    one read wins.

    Returns:
        dict: ``{field 0: field 14}`` for every English row.

    Raises:
        IndexError: if a non-blank row has fewer than 15 pipe-separated fields.
    """
    concept_names = {}
    # MRCONSO.RRF is distributed as UTF-8; do not depend on the locale default.
    with open(umls_concept, encoding="utf-8") as f:
        # A text file is already a lazy line iterator, so no manual chunking.
        for raw_line in f:
            # Lines read from a file keep their trailing newline; strip it so
            # the blank-line guard can actually trigger.
            line = raw_line.rstrip("\n")
            if line == "":
                continue
            line_list = line.split("|")
            if line_list[1] == 'ENG':
                concept_names[line_list[0]] = line_list[14]
    return concept_names


def umls_semtype():
    """Build a lookup keyed by field 2 of each semantic-group row.

    Reads the pipe-delimited file at the module-level path ``semgroups`` and,
    for every non-empty line, stores ``[field 1, field 3]`` under ``field 2``.

    Returns:
        dict: ``{field 2: [field 1, field 3]}``.
    """
    with open(semgroups) as handle:
        rows = [entry.split("|") for entry in handle.read().split("\n") if entry != ""]
    return {fields[2]: [fields[1], fields[3]] for fields in rows}

In [4]:
# Each loader shares its name with the variable that stores its result, so the
# first run replaces the function with a dict. The callable() guards keep this
# cell re-runnable: without them a second run raises
# "TypeError: 'dict' object is not callable".
if callable(umls_concepts):
    umls_concepts = umls_concepts()
if callable(umls_semtype):
    umls_semtype = umls_semtype()

umls_semtype['T131']

['Chemicals & Drugs', 'Hazardous or Poisonous Substance']

In [5]:
# Number of entries in the concept lookup and in the semantic-type lookup.
len(umls_concepts), len(umls_semtype)

(4412440, 127)

## Build Data

In [6]:
!pip install git+https://github.com/ArshSekhon/pubtator_loader.git

from pubtator_loader import PubTatorCorpusReader
dataset_reader = PubTatorCorpusReader(medmentions)

corpus = dataset_reader.load_corpus() 
# corpus will be a List[PubtatorDocuments]

Collecting git+https://github.com/ArshSekhon/pubtator_loader.git
  Cloning https://github.com/ArshSekhon/pubtator_loader.git to /tmp/pip-req-build-c5m2otci
  Running command git clone -q https://github.com/ArshSekhon/pubtator_loader.git /tmp/pip-req-build-c5m2otci


In [7]:
def build_data(docs=None):
    """Turn annotated documents into parallel token and tag sequences.

    For every document, each entity's ``text_segment`` is replaced (first
    textual occurrence only) by its ``entity_id`` surrounded by spaces. The
    resulting text is split on single spaces and empty pieces are dropped.
    A token equal to an entity id is tagged with the first semantic type of
    the first entity carrying that id; every other token is tagged ``'o'``.

    NOTE(review): mentions are located by text, not by ``start_index`` /
    ``end_index``, so the occurrence that gets replaced is not guaranteed to
    be the annotated one.

    Args:
        docs: iterable of documents exposing ``id``, ``entities`` and
            ``get_space_separated_title_and_abstract()``. Defaults to the
            module-level ``corpus``.

    Returns:
        tuple: ``(sequences, labels, article_ids)`` - two lists of per-document
        lists of strings plus the list of document ids, all in document order.
        A document without tokens contributes two empty lists.
    """
    if docs is None:
        docs = corpus

    all_tokens, all_tags, doc_ids = [], [], []
    for doc in docs:
        text_with_ids = doc.get_space_separated_title_and_abstract()

        # First entity seen per concept id: the same entity list.index() would
        # have selected, but found in O(1) per token.
        first_entity = {}
        for ent in doc.entities:
            text_with_ids = text_with_ids.replace(
                ent.text_segment, " " + ent.entity_id + " ", 1
            )
            first_entity.setdefault(ent.entity_id, ent)

        doc_ids.append(doc.id)

        tokens, tags = [], []
        for word in text_with_ids.split(" "):
            if word == '':
                continue
            ent = first_entity.get(word)
            tokens.append(word)
            if ent is None:
                tags.append('o')
            else:
                # Several semantic types may be comma-joined; keep the first.
                tags.append(ent.semantic_type_id.split(",")[0])

        # Plain lists avoid the numpy round-trip, which raised IndexError for
        # a document that produced no tokens.
        all_tokens.append(tokens)
        all_tags.append(tags)
    return all_tokens, all_tags, doc_ids
    
# Materialise token sequences, tag sequences and document ids for the corpus.
sequences, labels, articles_ids = build_data()

In [8]:
# Spot checks on documents 0 and 5: selected positions must hold the expected
# concept id in `sequences` and the matching semantic type in `labels`.
assert sequences[0][0] == "C4308010"
assert labels[0][0] == "T116"
assert sequences[0][-5] == "C0854135"
assert labels[0][-5] == "T047"

assert sequences[5][1] == "C0870811"
assert labels[5][1] == "T054"
assert sequences[5][-2] == "C0243095"
assert labels[5][-2] == "T033"

## Data Visualization

In [9]:
def tokens_tags(data_X, data_y):
    """Summarise a tagged dataset.

    Args:
        data_X: iterable of token sequences (sequences of strings).
        data_y: iterable of tag sequences.

    Returns:
        tuple: ``(n_tokens, n_tags, max_len)`` where ``n_tokens`` is the number
        of distinct lower-cased tokens in ``data_X``, ``n_tags`` the number of
        distinct tags in ``data_y`` and ``max_len`` the length of the longest
        sequence in ``data_X`` (0 when ``data_X`` is empty).
    """
    # Sets give O(1) membership; the previous list-based dedup was quadratic.
    vocab = {word.lower() for seq in data_X for word in seq}
    tags = {tag for seq_tags in data_y for tag in seq_tags}
    # Measure the argument itself, not the notebook-level `sequences` variable.
    longest = max((len(seq) for seq in data_X), default=0)
    return len(vocab), len(tags), longest

# Corpus-wide statistics. num_tokens, num_tags and maxlen are reused later to
# size the tokenizers, the padding length and the model layers.
num_tokens, num_tags, maxlen = tokens_tags(sequences, labels)
print('Number of unique tokens ', num_tokens)
print('Max length of sequence ', maxlen)
print('Number of unique tags ', num_tags)

Number of unique tokens  66317
Max length of sequence  685
Number of unique tags  127


In [43]:
def minority_classes(entities, threshold=10):
    """Return the tags that occur fewer than ``threshold`` times.

    Args:
        entities: iterable of tag sequences.
        threshold: a tag is reported when its total count is strictly below
            this value (default 10).

    Returns:
        dict: ``{tag: count}`` for every under-represented tag, in order of
        first appearance.
    """
    class_count = {}
    for seq_tags in entities:
        for tag in seq_tags:
            class_count[tag] = class_count.get(tag, 0) + 1

    return {tag: count for tag, count in class_count.items() if count < threshold}

# Tags that appear fewer than ten times across the whole corpus.
minority_classes(labels)            

{'T021': 2, 'T127': 2}

## Train Test Val split

In [10]:
def train_test_val_split():
    """Distribute the corpus over train / validation / test by document id.

    The three id files named by ``train_docs_ids``, ``val_docs_ids`` and
    ``test_docs_ids`` hold one id per line. Each entry of the module-level
    ``sequences`` / ``labels`` / ``articles_ids`` goes to the first split whose
    id file contains ``str(id)``; documents listed nowhere are dropped.

    Returns:
        tuple: ``(train_sequences, train_labels, val_sequences, val_labels,
        test_sequences, test_labels)``.
    """
    def _load_ids(path):
        # Ids are compared as strings; a set makes each lookup O(1).
        with open(path) as handle:
            return set(handle.read().split("\n"))

    ids_by_split = [
        ("train", _load_ids(train_docs_ids)),
        ("val", _load_ids(val_docs_ids)),
        ("test", _load_ids(test_docs_ids)),
    ]
    picked = {split: ([], []) for split, _ in ids_by_split}

    for x, y, idx in zip(sequences, labels, articles_ids):
        doc_key = str(idx)
        # Splits are checked in train, val, test order; the first match wins.
        for split, known_ids in ids_by_split:
            if doc_key in known_ids:
                picked[split][0].append(x)
                picked[split][1].append(y)
                break

    return (
        picked["train"][0], picked["train"][1],
        picked["val"][0], picked["val"][1],
        picked["test"][0], picked["test"][1],
    )


# Per-split token and tag sequences (still lists of strings at this point).
train_sequences, train_labels, val_sequences, val_labels, test_sequences, test_labels = train_test_val_split()

In [45]:
# Document counts per split; sequence and label counts must match pairwise.
len(train_sequences), len(train_labels), len(val_sequences), len(val_labels), len(test_sequences), len(test_labels)

(2635, 2635, 878, 878, 879, 879)

In [46]:
# Vocabulary and tag counts per split; tokens_tags()[:2] drops the max length.
(num_tokens_train, num_tags_train), (num_tokens_val, num_tags_val), (num_tokens_test, num_tags_test) = [
    tokens_tags(split_X, split_y)[:2]
    for split_X, split_y in (
        (train_sequences, train_labels),
        (val_sequences, val_labels),
        (test_sequences, test_labels),
    )
]
for split_name, n_split_tokens, n_split_tags in (
    ('train', num_tokens_train, num_tags_train),
    ('val', num_tokens_val, num_tags_val),
    ('test', num_tokens_test, num_tags_test),
):
    print('Number of unique tokens in ' + split_name + ' data ', n_split_tokens)
    print('Number of unique tags in ' + split_name + ' data ', n_split_tags)

Number of unique tokens in train data  48187
Number of unique tags in train data  127
Number of unique tokens in val data  23181
Number of unique tags in val data  125
Number of unique tokens in test data  23252
Number of unique tags in test data  124


In [11]:
def encode_pad_data():
    """Integer-encode and pad every split.

    Two Keras tokenizers are fitted: one on the token sequences of all three
    splits, one on the training labels only. Every split is then converted to
    integer ids and post-padded to ``maxlen`` as ``int32``.

    NOTE(review): the token vocabulary is deliberately fitted on the
    validation and test sequences too, so that every token receives an index.
    The label tokenizer only sees the training labels.

    Returns:
        tuple: ``(train_seqs, train_labels, test_seqs, test_labels, val_seqs,
        val_labels, tokenizer, label_tokenizer)``.
    """
    tokenizer = text.Tokenizer(num_tokens+1, lower=True)
    label_tokenizer = text.Tokenizer(num_tags+1, lower=True)

    for split_tokens in (train_sequences, val_sequences, test_sequences):
        tokenizer.fit_on_texts(split_tokens)
    label_tokenizer.fit_on_texts(train_labels)

    def _to_padded_ids(fitted_tokenizer, samples):
        # texts -> integer ids -> fixed-length int32 rows, padded at the end
        encoded = fitted_tokenizer.texts_to_sequences(samples)
        return sequence.pad_sequences(encoded, maxlen=maxlen, dtype='int32', padding='post')

    padded_train_sequences = _to_padded_ids(tokenizer, train_sequences)
    padded_train_labels = _to_padded_ids(label_tokenizer, train_labels)
    padded_test_sequences = _to_padded_ids(tokenizer, test_sequences)
    padded_test_labels = _to_padded_ids(label_tokenizer, test_labels)
    padded_val_sequences = _to_padded_ids(tokenizer, val_sequences)
    padded_val_labels = _to_padded_ids(label_tokenizer, val_labels)

    return (
        padded_train_sequences, padded_train_labels,
        padded_test_sequences, padded_test_labels,
        padded_val_sequences, padded_val_labels,
        tokenizer, label_tokenizer,
    )

# NOTE(review): this rebinds train_labels, test_labels and val_labels from the
# string tag lists returned by train_test_val_split() to padded integer arrays.
# encode_pad_data() reads those same globals, so re-running this cell feeds the
# arrays back in; re-run the split cell first.
train_seqs, train_labels, test_seqs, test_labels, val_seqs, val_labels, tokenizer ,label_tokenizer = encode_pad_data()

In [20]:
# Reverse lookups (integer id -> lower-cased string) used to print predictions.
inv_label_map = {v: k for k, v in label_tokenizer.word_index.items()}
inv_token_map = {v: k for k, v in tokenizer.word_index.items()}

In [21]:
# Size of the fitted token vocabulary.
len(inv_token_map)

66317

In [11]:
# Shapes of the padded inputs and labels, in train / test / val order.
for padded in (train_seqs, train_labels, test_seqs, test_labels, val_seqs, val_labels):
    print(np.array(padded).shape)

(2635, 685)
(2635, 685)
(879, 685)
(879, 685)
(878, 685)
(878, 685)


### F1 score excluding certain entities like 'O' and "PAD"

In [12]:
def exclude_from_f1(y_true, y_pred, excluded_tags=[0]):
    """Weighted F1 over the positions whose true tag is not excluded.

    Args:
        y_true: iterable of true tag ids.
        y_pred: iterable of predicted tag ids, aligned with ``y_true``.
        excluded_tags: true-tag values whose positions are ignored.

    Returns:
        The value of ``f1_score(..., average='weighted')`` on the kept pairs.
    """
    kept_pairs = [
        (truth, guess)
        for truth, guess in zip(y_true, y_pred)
        if truth not in excluded_tags
    ]
    kept_true = [truth for truth, _ in kept_pairs]
    kept_pred = [guess for _, guess in kept_pairs]
    return f1_score(kept_true, kept_pred, average='weighted')

## BiLSTM tagger with trainable embeddings (no attention layer)

In [24]:
def mask(m, q):
    """Flag the entries of ``m`` that differ from ``q`` along the last axis.

    Compares ``m`` with ``q`` element-wise and reduces with a logical OR over
    the last axis, so an entry is ``True`` when any component differs.
    The original author notes that ``m`` is assumed to be 2-D.
    """
    differs = tf.not_equal(m, q)
    return tf.math.reduce_any(differs, axis=-1)

def recall(y_true, y_pred):
    """Recall over the rows of ``y_true`` that are not all-zero.

    NOTE(review): the all-zero reference vector has ``num_tags+1`` entries,
    which suggests ``y_true`` is expected to be one-hot encoded. The models in
    this notebook are compiled with sparse integer labels and
    ``metrics=['accuracy']`` only - confirm before wiring this metric in.
    """
    zero_row = tf.constant([0] * (num_tags+1), dtype=tf.float32)
    keep = mask(y_true, zero_row)
    truth = tf.boolean_mask(y_true, keep)
    preds = tf.boolean_mask(y_pred, keep)
    # Rounded products count positions where both truth and prediction are on.
    hits = K.sum(K.round(K.clip(truth * preds, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(truth, 0, 1)))
    # epsilon guards against division by zero
    return hits / (actual_positives + K.epsilon())

def precision(y_true, y_pred):
    """Precision over the rows of ``y_true`` that are not all-zero.

    NOTE(review): the all-zero reference vector has ``num_tags+1`` entries,
    which suggests ``y_true`` is expected to be one-hot encoded. The models in
    this notebook are compiled with sparse integer labels and
    ``metrics=['accuracy']`` only - confirm before wiring this metric in.
    """
    zero_row = tf.constant([0] * (num_tags+1), dtype=tf.float32)
    keep = mask(y_true, zero_row)
    truth = tf.boolean_mask(y_true, keep)
    preds = tf.boolean_mask(y_pred, keep)
    # Rounded products count positions where both truth and prediction are on.
    hits = K.sum(K.round(K.clip(truth * preds, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(preds, 0, 1)))
    # epsilon guards against division by zero
    return hits / (flagged_positives + K.epsilon())

def f1(y_true, y_pred):
    """Harmonic mean of ``precision`` and ``recall`` for the same inputs."""
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    # epsilon keeps the ratio defined when both precision and recall are zero
    harmonic_half = (p*r)/(p+r+K.epsilon())
    return 2*(harmonic_half)

In [25]:
# Width shared by the embedding, each LSTM direction and the hidden Dense layer.
output_dim = 100
tf.random.set_seed(42)
opt = Adam(0.01)

# One integer token id per time step, padded to maxlen.
sequence_input = Input(shape=(maxlen,), dtype=tf.int32, name='sequence_input')
# NOTE(review): sequence_mask is never used below; masking comes from
# mask_zero=True on the Embedding layer instead.
sequence_mask = Lambda(lambda x: tf.greater(x, 0))(sequence_input)
# input_dim is num_tokens+1 because id 0 is reserved for padding.
outputs = Embedding(input_dim=num_tokens+1, output_dim=output_dim, trainable=True, mask_zero=True)(sequence_input)
# return_sequences=True keeps one output per time step; forward and backward
# states are concatenated.
outputs = Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(outputs)
outputs = (TimeDistributed(Dense(output_dim, activation="relu")))(outputs)

# Per-time-step distribution over num_tags+1 classes (the tags plus padding id 0).
outputs = Dense(num_tags+1, activation="softmax")(outputs)

model = Model(inputs=sequence_input, outputs=outputs)
# Sparse loss: targets are integer class ids rather than one-hot vectors.
model.compile(loss = 'SparseCategoricalCrossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequence_input (InputLayer)  [(None, 685)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 685, 100)          6631800   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 685, 200)          160800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 685, 100)          20100     
_________________________________________________________________
dense_3 (Dense)              (None, 685, 128)          12928     
Total params: 6,825,628
Trainable params: 6,825,628
Non-trainable params: 0
_________________________________________________________________


In [13]:
def get_weights(y_classes, n_tags=None):
    """Compute inverse-frequency class weights for one label sequence.

    Every class present in ``y_classes`` gets ``1 / count * total / n_tags``,
    where ``total`` is ``len(y_classes)``. Every other class id in
    ``0..n_tags`` gets weight 0, and class 0 is always forced to 0.

    Args:
        y_classes: sequence of integer class ids.
        n_tags: number of real tags; class ids then range over ``0..n_tags``.
            Defaults to the module-level ``num_tags``.

    Returns:
        dict: ``{class_id: weight}`` covering every id in ``0..n_tags``.
    """
    if n_tags is None:
        n_tags = num_tags

    total = len(y_classes)
    counts = {}
    for tag in y_classes:
        counts[tag] = counts.get(tag, 0) + 1

    class_weight = {
        tag: (1 / count * total / n_tags) for tag, count in counts.items()
    }

    # Cover ids 0..n_tags inclusive, matching the num_tags+1 output units; the
    # previous hard-coded range(127) stopped one short of the highest id.
    for class_id in range(n_tags + 1):
        class_weight.setdefault(class_id, 0)
    class_weight[0] = 0
    return class_weight

In [26]:
# Train for 20 epochs, one document per train_on_batch call, then report the
# mean per-document F1 on the first 100 validation documents.
for epoch in range(20):
    print('epoch ', epoch)
    for x, y in zip(train_seqs, train_labels):
        # Class weights are recomputed from the tag counts of this document only.
        weights = get_weights(y)
        # NOTE(review): x and y are single 1-D rows of length maxlen. Keras
        # treats the leading axis as the batch axis, so this appears to train on
        # maxlen one-token samples rather than on one maxlen-token sequence; the
        # `yhat[0]` indexing in the prediction cell is consistent with that.
        # Confirm, and if unintended add a batch axis (x[np.newaxis, :]) here
        # and in every predict call.
        model.train_on_batch(x, y, class_weight=weights)

    val_f1 = []
    for x, y in zip(val_seqs[:100], val_labels[:100]):
        y_pred = np.argmax(model.predict(x), axis=-1)
        # Label id 0 is padding; presumably id 1 is the 'o' tag - confirm via
        # label_tokenizer.word_index.
        val_f1.append(exclude_from_f1(y, y_pred, [0, 1]))
    
    print(np.mean(val_f1))

epoch  0
0.775173251649654
epoch  1
0.8234216969425718
epoch  2
0.8335770025633539
epoch  3
0.8410674965074513
epoch  4
0.8464045860677775
epoch  5
0.8465307949579468
epoch  6
0.8490874167194016
epoch  7
0.8498744802113234
epoch  8
0.8500002116002235
epoch  9
0.8524950397519704
epoch  10
0.8515114480391359
epoch  11
0.8490540579284545
epoch  12
0.8522782980947268
epoch  13
0.8522411155052165
epoch  14
0.8492012639213445
epoch  15
0.8511595434411251
epoch  16
0.8502245950491596
epoch  17
0.8477115334304046
epoch  18
0.8473053002188435
epoch  19
0.8478006689706451


In [27]:
# Print token, true tag and predicted tag for every non-padding position of
# test document 400.
# NOTE(review): test_seqs[400] is a 1-D row, so predict() appears to treat each
# token as its own sample; that is why each prediction is indexed with [0].
y_pred = np.argmax(model.predict(test_seqs[400]), axis=-1)
for x, yhat, y in zip(test_seqs[400],y_pred, test_labels[400]):
    if x != 0:
        print(inv_token_map[x], inv_label_map[y], inv_label_map[yhat[0]])

c2350277 t063 t063
of o o
c0017393 t045 t045
in o o
c0008051 t012 t012
through o o
c0752046 t086 t086
c0008051 t012 t012
is o o
recognized o o
as o o
an o o
excellent o o
c0026339 t075 t075
for o o
studies o o
of o o
c0314603 t169 t169
c0441712 t169 t169
of o o
c0031437 t032 t032
and o o
c0017428 t028 t028
c0015219 t045 t045
, o o
with o o
large o o
effective o o
c0032683 t081 t081
and o o
strong o o
c0086418 t016 t016
-driven o o
c1707391 t052 t052
. o o
in o o
the o o
present o o
study, o o
we o o
performed o o
c0545278 unknowntype t005
( o o
c0545278 unknowntype t005
) o o
tests o o
to o o
identify o o
significant o o
c0004793 t086 t086
employing o o
600k o t005
c0752046 t086 t086
c0008051 t012 t012
c3897601 t063 t005
in o o
an o o
c0441833 t078 t078
of o o
1,534 o t005
c0005595 t012 t012
, o o
which o o
was o o
derived o o
from o o
c0010364 t059 t005
between o o
c0008051 t012 t012
and o o
c0008051 t012 t012
. o o
results o o
indicated o o
that o o
a o o
total o o
of o o
49,151 o t0

In [28]:
# Per-document weighted F1 on the test split, averaged over documents.
f1_scores_no_O, f1_scores_with_O = [], []
# Collects every per-token prediction; not read again in this cell.
y_pred = []

for x, y in zip(test_seqs, test_labels):
    # NOTE(review): x is a 1-D row, so predict() appears to score each token as
    # an independent one-step sequence - confirm this is intended.
    ypred = model.predict(x)
    ypred = np.argmax(ypred, axis=-1)
    y_pred.extend(ypred)
    # Label id 0 is padding; presumably id 1 is the 'o' tag - confirm via
    # label_tokenizer.word_index.
    f1_scores_no_O.append(exclude_from_f1(y, ypred, [0, 1]))
    f1_scores_with_O.append(exclude_from_f1(y, ypred, [0]))

print('f1 score on test data including "Other" tag', np.mean(f1_scores_with_O))
print('f1 score on test data without "Other" tag', np.mean(f1_scores_no_O))

f1 score on test data including "Other" tag 0.9327400148537758
f1 score on test data without "Other" tag 0.8515278887145905


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

### BiLSTM with custom UMLS embeddings

In [14]:
def concept_dict(path=None):
    """Load concept embeddings from a header-less CSV into a dict.

    The first column of every row is used as the key and the remaining columns
    as the value.

    Args:
        path: CSV file to read. Defaults to the module-level
            ``umls_embedding_file``.

    Returns:
        dict: ``{first column: numpy array of the remaining columns}``.
    """
    if path is None:
        path = umls_embedding_file

    rows = np.array(pd.read_csv(path, header=None))
    # Locals are released on return, so no explicit `del` is needed here.
    return {row[0]: row[1:] for row in rows}

# Concept embeddings keyed by the first CSV column.
concepts = concept_dict()

here


In [15]:
def get_embeddings():
    """Build the initial embedding matrix for the token vocabulary.

    Row ``i`` holds the vector stored in ``concepts`` under the upper-cased
    form of the token with index ``i`` in ``tokenizer.word_index``. Tokens
    without an entry keep an all-zero row. A progress line is printed whenever
    the token index is a multiple of 1000.

    Returns:
        tuple: ``(embedding_matrix, embed_size)`` where the matrix has shape
        ``(num_tokens+1, embed_size)``.
    """
    embed_size=50
    embedding_matrix = np.zeros((num_tokens+1, embed_size))

    for token, row in tokenizer.word_index.items():
        if row%1000 == 0:
            print('1000 words done')
        # Vocabulary entries are lower-cased; the lookup uses the upper-cased form.
        vector = concepts.get(token.upper())
        if vector is None:
            continue
        embedding_matrix[row] = vector

    return embedding_matrix, embed_size

embedding_matrix, embed_size = get_embeddings()
# Drop the lookup once the matrix is built.
# NOTE(review): after this `del`, re-running this cell requires re-running the
# concept_dict() cell first.
del concepts

1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 words done
1000 wor

In [17]:
#from keras_self_attention import SeqSelfAttention

# Width of each LSTM direction and of the hidden Dense layer.
output_dim = 32
tf.random.set_seed(42)
opt = Adam(0.001)

# One integer token id per time step, padded to maxlen.
sequence_input = Input(shape=(maxlen,), dtype=tf.int32, name='sequence_input')
# NOTE(review): sequence_mask is never used below; masking comes from
# mask_zero=True on the Embedding layer instead.
sequence_mask = Lambda(lambda x: tf.greater(x, 0))(sequence_input)
# Embedding starts from embedding_matrix and is still fine-tuned (trainable=True).
outputs = Embedding(input_dim=num_tokens+1, output_dim=embed_size, weights=[embedding_matrix], trainable=True, mask_zero=True)(sequence_input)
outputs = Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(outputs)
outputs = (TimeDistributed(Dense(output_dim, activation="relu")))(outputs)

# Disabled experiment: convolution plus global pooling head.
#outputs = Conv1D(64, kernel_size=3, padding='same', kernel_initializer='glorot_uniform')(outputs)

#avg_pool = GlobalAveragePooling1D()(outputs)
#max_pool = GlobalAveragePooling1D()(outputs)
#outputs = concatenate([avg_pool, max_pool])
# Per-time-step distribution over num_tags+1 classes (the tags plus padding id 0).
outputs = Dense(num_tags+1, activation="softmax")(outputs)

# Rebinds `model`: the model built in the earlier cell is no longer reachable
# under this name.
model = Model(inputs=sequence_input, outputs=outputs)
# Sparse loss: targets are integer class ids rather than one-hot vectors.
model.compile(loss = 'SparseCategoricalCrossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequence_input (InputLayer)  [(None, 685)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 685, 50)           3315900   
_________________________________________________________________
bidirectional (Bidirectional (None, 685, 64)           21248     
_________________________________________________________________
time_distributed (TimeDistri (None, 685, 32)           2080      
_________________________________________________________________
dense_1 (Dense)              (None, 685, 128)          4224      
Total params: 3,343,452
Trainable params: 3,343,452
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Same procedure as the first training loop: 20 epochs, one document per
# train_on_batch call, then mean per-document F1 on 100 validation documents.
for epoch in range(20):
    print('epoch ', epoch)
    for x, y in zip(train_seqs, train_labels):
        # Class weights are recomputed from the tag counts of this document only.
        weights = get_weights(y)
        # NOTE(review): x and y are single 1-D rows of length maxlen. Keras
        # treats the leading axis as the batch axis, so this appears to train on
        # maxlen one-token samples rather than on one maxlen-token sequence.
        # Confirm, and if unintended add a batch axis (x[np.newaxis, :]) here
        # and in every predict call.
        model.train_on_batch(x, y, class_weight=weights)

    val_f1 = []
    for x, y in zip(val_seqs[:100], val_labels[:100]):
        y_pred = np.argmax(model.predict(x), axis=-1)
        # Label id 0 is padding; presumably id 1 is the 'o' tag - confirm via
        # label_tokenizer.word_index.
        val_f1.append(exclude_from_f1(y, y_pred, [0, 1]))
    
    print(np.mean(val_f1))

epoch  0
0.7181724454134879
epoch  1
0.8073648844690546
epoch  2
0.8276048702985144
epoch  3
0.8544529420642106
epoch  4
0.8590060742729123
epoch  5
0.8610952470975347
epoch  6
0.8614899307170539
epoch  7
0.8613326378830167
epoch  8
0.8587416430405108
epoch  9
0.8619065507288421
epoch  10
0.8622863866606998
epoch  11
0.861752374239942
epoch  12
0.8622863866606998
epoch  13
0.8622863866606998
epoch  14
0.8622354816833241
epoch  15
0.8621831874862932
epoch  16
0.862062506063685
epoch  17
0.8618390088295879
epoch  18
0.8622863866606998
epoch  19
0.8622863866606998


In [22]:
# Print token, true tag and predicted tag for every non-padding position of
# test document 400.
# NOTE(review): test_seqs[400] is a 1-D row, so predict() appears to treat each
# token as its own sample; that is why each prediction is indexed with [0].
y_pred = np.argmax(model.predict(test_seqs[400]), axis=-1)
for x, yhat, y in zip(test_seqs[400],y_pred, test_labels[400]):
    if x != 0:
        print(inv_token_map[x], inv_label_map[y], inv_label_map[yhat[0]])

c2350277 t063 t063
of o o
c0017393 t045 t045
in o o
c0008051 t012 t012
through o o
c0752046 t086 t086
c0008051 t012 t012
is o o
recognized o o
as o o
an o o
excellent o o
c0026339 t075 t075
for o o
studies o o
of o o
c0314603 t169 t169
c0441712 t169 t169
of o o
c0031437 t032 t032
and o o
c0017428 t028 t028
c0015219 t045 t045
, o o
with o o
large o o
effective o o
c0032683 t081 t081
and o o
strong o o
c0086418 t016 t016
-driven o o
c1707391 t052 t052
. o o
in o o
the o o
present o o
study, o o
we o o
performed o o
c0545278 unknowntype t190
( o o
c0545278 unknowntype t190
) o o
tests o o
to o o
identify o o
significant o o
c0004793 t086 t086
employing o o
600k o t190
c0752046 t086 t086
c0008051 t012 t012
c3897601 t063 t190
in o o
an o o
c0441833 t078 t078
of o o
1,534 o t190
c0005595 t012 t012
, o o
which o o
was o o
derived o o
from o o
c0010364 t059 t190
between o o
c0008051 t012 t012
and o o
c0008051 t012 t012
. o o
results o o
indicated o o
that o o
a o o
total o o
of o o
49,151 o t1

In [23]:
# Per-document weighted F1 on the test split, averaged over documents.
f1_scores_no_O, f1_scores_with_O = [], []
# Collects every per-token prediction; not read again in this cell.
y_pred = []

for x, y in zip(test_seqs, test_labels):
    # NOTE(review): x is a 1-D row, so predict() appears to score each token as
    # an independent one-step sequence - confirm this is intended.
    ypred = model.predict(x)
    ypred = np.argmax(ypred, axis=-1)
    y_pred.extend(ypred)
    # Label id 0 is padding; presumably id 1 is the 'o' tag - confirm via
    # label_tokenizer.word_index.
    f1_scores_no_O.append(exclude_from_f1(y, ypred, [0, 1]))
    f1_scores_with_O.append(exclude_from_f1(y, ypred, [0]))

print('f1 score on test data including "Other" tag', np.mean(f1_scores_with_O))
print('f1 score on test data without "Other" tag', np.mean(f1_scores_no_O))

f1 score on test data including "Other" tag 0.9371161541520516
f1 score on test data without "Other" tag 0.8626894803513677
