In [None]:
!pip install seqeval

In [148]:
import numpy as np
import os
import json
from datetime import datetime
import shutil
import subprocess
import pandas as pd
import seqeval

In [135]:
def refomat_data(data_file):
    """Reduce a tab-separated CoNLL file to ``token<TAB>label`` rows.

    Lines containing ``-DOCSTART-`` are skipped, blank lines (sentence
    separators) are kept, and for every other line only the first column
    (token) and the fourth column (label) are retained.

    Args:
        data_file: Path of the tab-separated input file.

    Returns:
        A flat list of string fragments; each data row contributes
        ``token, "\\t", label, "\\n"`` and each blank line contributes
        ``"\\n"``. Concatenating the fragments yields the reformatted file.
        A single leading blank separator is removed.

    Raises:
        IndexError: If a non-blank data line has fewer than four columns.
    """
    output_lines = []
    with open(data_file, 'r') as file:
        for line in file:
            if "-DOCSTART-" in line:
                continue
            if line == "\n":
                output_lines.append("\n")
                continue
            # Strip only the line terminator so the label survives intact
            # even when the final line of the file has no trailing newline.
            fields = line.rstrip("\n").split("\t")
            output_lines.extend((fields[0], "\t", fields[3], "\n"))

    # Drop the blank separator that follows a leading -DOCSTART- header, but
    # never a real token when the file starts directly with data.
    if output_lines and output_lines[0] == "\n":
        return output_lines[1:]
    return output_lines
    
# Reformat every split of both tasks (entity detection: *_ent, recognition:
# *_sts). map() is consumed left to right by the unpacking, so the files are
# read in the order listed.
(train_detect_lines,
 test_detect_lines,
 dev_detect_lines,
 train_recog_lines,
 test_recog_lines,
 dev_recog_lines) = map(refomat_data, [
    "../input/medlinker-data/mm_ner_ent.train.conll",
    "../input/medlinker-data/mm_ner_ent.test.conll",
    "../input/medlinker-data/mm_ner_ent.dev.conll",
    "../input/medlinker-data/mm_ner_sts.train.conll",
    "../input/medlinker-data/mm_ner_sts.test.conll",
    "../input/medlinker-data/mm_ner_sts.dev.conll",
])

In [67]:
# Create the output directory; exist_ok keeps the cell re-runnable instead of
# failing with "File exists" on the second execution.
os.makedirs("./train_data/", exist_ok=True)

mkdir: cannot create directory ‘./train_data/’: File exists


In [136]:
# Persist the detection-task fragments of each split as train/dev/test files.
for out_path, fragments in (
    ('./train_data/train.txt', train_detect_lines),
    ('./train_data/dev.txt', dev_detect_lines),
    ('./train_data/test.txt', test_detect_lines),
):
    with open(out_path, 'w') as f:
        f.writelines(fragments)

In [69]:
# List the output directory to confirm which files it contains.
!ls ./train_data/

dev.txt  test.txt  train.txt


In [137]:
# Read the written train/dev files back as lists of lines.
train_lines, dev_lines = [], []
for split_path, bucket in (
    ('./train_data/train.txt', train_lines),
    ('./train_data/dev.txt', dev_lines),
):
    with open(split_path, 'r') as f:
        bucket.extend(f)

In [138]:
# Preview rows 1-19 of the training file ("token\tlabel\n"; a bare "\n"
# separates sentences).
train_lines[1:20]

['as\tO\n',
 'a\tO\n',
 'modifier\tO\n',
 'of\tO\n',
 'chronic\tB-Entity\n',
 'Pseudomonas\tI-Entity\n',
 'aeruginosa\tI-Entity\n',
 'infection\tI-Entity\n',
 'in\tO\n',
 'cystic\tO\n',
 'fibrosis\tO\n',
 '\n',
 'Pseudomonas\tB-Entity\n',
 'aeruginosa\tI-Entity\n',
 '(\tI-Entity\n',
 'Pa\tI-Entity\n',
 ')\tI-Entity\n',
 'infection\tI-Entity\n',
 'in\tO\n']

In [166]:
# Tally how often each tag occurs in the training file, checking along the
# way that every tag is either 'O' or carries a 'B-'/'I-' prefix.
labels = {}

for row in train_lines:
    if row != '\n':
        tag = row.split('\t')[-1][:-1]
        assert tag == 'O' or tag.startswith(('B-', 'I-')), "label wrong! %s" % tag
        labels[tag] = labels.get(tag, 0) + 1

In [165]:
# Display the per-tag counts collected from the training file.
labels

{'B-Entity': 115076, 'O': 523890, 'I-Entity': 57373}

In [167]:
# Write one tag per line, in descending lexicographic order.
with open("./train_data/label.txt", "w") as f:
    f.writelines(label + "\n" for label in sorted(labels, reverse=True))

In [168]:
# List the output directory again; label.txt should now be present.
!ls ./train_data

dev.txt  label.txt  test.txt  train.txt


In [None]:
def unique_words():
    """Measure vocabulary size and the longest sentence across all splits.

    Reads the module-level ``train_sentences``, ``test_sentences`` and
    ``dev_sentences``; each is iterated as articles -> sentences -> words.

    Returns:
        A tuple ``(vocab_size, longest)`` where ``vocab_size`` is the number
        of distinct lower-cased words and ``longest`` is the length of the
        longest sentence found in any article.
    """
    vocab_counts = {}
    longest_per_article = []
    for corpus in (train_sentences, test_sentences, dev_sentences):
        for article in corpus:
            longest_per_article.append(max(map(len, article)))
            for sentence in article:
                # np.unique de-duplicates within the sentence, so each word
                # is counted at most once per sentence (before lower-casing).
                for token in np.unique(sentence):
                    key = token.lower()
                    vocab_counts[key] = vocab_counts.get(key, 0) + 1

    return len(vocab_counts), max(longest_per_article)
            
# Vocabulary size and longest sentence length; both are read as globals by
# later cells (tokenizer size, padding length, model input shape).
num_tokens, maxlen = unique_words()
num_tokens, maxlen

## Download and extract the pre-trained UmlsBERT model

In [None]:
# Download the UmlsBERT archive, unpack it into the current working
# directory, then delete the archive.
!wget -O umlsbert.tar.xz https://www.dropbox.com/s/qaoq5gfen69xdcc/umlsbert.tar.xz?dl=0
!tar -xvf umlsbert.tar.xz
!rm umlsbert.tar.xz

In [146]:
def generate_command(config):
    """Build the shell command line that launches the NER script.

    Args:
        config: Mapping with the required keys ``run_file``, ``output_dir``,
            ``model_name_or_path``, ``data_dir``, ``num_train_epochs``,
            ``per_device_train_batch_size``, ``learning_rate``,
            ``max_seq_length``, ``seed`` and ``labels``. The optional
            boolean keys ``do_train``, ``do_eval``, ``do_predict`` and
            ``umls`` add the matching flag only when their value is truthy;
            a truthy ``umls`` additionally requires ``med_document``.

    Returns:
        The command as a single string suitable for a POSIX shell. Every
        value is shell-quoted, so paths containing spaces or shell
        metacharacters are passed through as a single argument.

    Raises:
        KeyError: If a required key is missing from ``config``.
    """
    import shlex  # local import: keeps this cell independent of the import cell

    def _arg(key):
        # Quote each value; plain paths and numbers come out unchanged.
        return shlex.quote(str(config[key]))

    parts = ["python3", _arg("run_file")]
    for option in ("output_dir",
                   "model_name_or_path",
                   "data_dir",
                   "num_train_epochs",
                   "per_device_train_batch_size",
                   "learning_rate",
                   "max_seq_length"):
        parts += ["--" + option, _arg(option)]

    # Honour the flag's value, not merely the key's presence, so that
    # e.g. {"do_train": False} really disables training.
    for flag in ("do_train", "do_eval", "do_predict"):
        if config.get(flag):
            parts.append("--" + flag)

    parts += ["--seed", _arg("seed")]
    if config.get("umls"):
        parts += ["--umls", "--med_document", _arg("med_document")]

    parts += ["--labels", _arg("labels"), "--save_steps", "50000"]

    return " ".join(parts)

In [175]:
# Create the results directory; exist_ok keeps the cell re-runnable.
os.makedirs("./results", exist_ok=True)

In [176]:
# Paths and hyper-parameters handed to run_ner.py through generate_command.
config = dict(
    run_file="../input/umlsbert/run_ner.py",
    labels="./train_data/label.txt",
    output_dir="./results",
    model_name_or_path="./umlsbert",
    data_dir="./train_data",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    learning_rate=1e-4,
    max_seq_length=80,
    seed=42,
    do_train=True,
    do_eval=True,
    umls=True,
    med_document="../input/umlsbert/vocab_updated.txt",
    do_predict=True,
)

# Run the downstream task with the given config; the CompletedProcess shown
# as the cell result carries the script's return code.
command = generate_command(config)
subprocess.run(command, shell=True)

CompletedProcess(args='python3 ../input/umlsbert/run_ner.py --output_dir ./results --model_name_or_path ./umlsbert --data_dir ./train_data --num_train_epochs 5 --per_device_train_batch_size 32 --learning_rate 0.0001 --max_seq_length 80 --do_train --do_eval --do_predict --seed 42 --umls --med_document ../input/umlsbert/vocab_updated.txt --labels ./train_data/label.txt --save_steps 50000', returncode=1)

In [179]:
# Same command line that generate_command(config) produced, launched through
# the `!` shell escape instead of subprocess.run.
# NOTE(review): presumably re-run this way to see the script's output in the
# notebook after the subprocess call returned code 1 - confirm.
!python3 ../input/umlsbert/run_ner.py --output_dir ./results --model_name_or_path ./umlsbert --data_dir ./train_data --num_train_epochs 5 --per_device_train_batch_size 32 --learning_rate 0.0001 --max_seq_length 80 --do_train --do_eval --do_predict --seed 42 --umls --med_document ../input/umlsbert/vocab_updated.txt --labels ./train_data/label.txt --save_steps 50000

In [None]:
import numpy as np
import tensorflow as tf
from keras.preprocessing import text, sequence
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K

from tensorflow.keras.layers import Input, Embedding, Bidirectional, Dense, LSTM, TimeDistributed, Lambda, SpatialDropout1D
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, SpatialDropout1D

from tensorflow.keras.models import Model
from keras.optimizers import SGD, Adam, RMSprop

In [None]:

def encode_pad_data():
    """Tokenize and pad the train/dev sentences and their labels.

    Reads the module-level ``num_tokens``, ``maxlen``, ``train_sentences``,
    ``test_sentences``, ``dev_sentences``, ``train_detection_labels`` and
    ``dev_detection_labels``.

    Returns:
        A 6-tuple ``(train_sentences, train_labels, dev_sentences,
        dev_labels, text_tokenizer, label_tokenizer)`` where the first four
        items are int32 sequences post-padded to ``maxlen`` and the last two
        are the fitted tokenizers.
    """
    text_tokenizer = text.Tokenizer(num_tokens+1, lower=True)
    label_tokenizer = text.Tokenizer(4)

    label_tokenizer.fit_on_texts(train_detection_labels)

    # Fit the text tokenizer on the test and validation sequences as well;
    # otherwise not all tokens would be tokenized, which would cause clashes.
    for corpus in (train_sentences, test_sentences, dev_sentences):
        text_tokenizer.fit_on_texts(corpus)

    encoded_train_sequences = text_tokenizer.texts_to_sequences(train_sentences)
    encoded_train_labels = label_tokenizer.texts_to_sequences(train_detection_labels)
    encoded_dev_sequences = text_tokenizer.texts_to_sequences(dev_sentences)
    encoded_dev_labels = label_tokenizer.texts_to_sequences(dev_detection_labels)

    def _pad(encoded):
        # Every array shares the same length, dtype and padding side.
        return sequence.pad_sequences(encoded, maxlen=maxlen, dtype='int32', padding='post')

    return (_pad(encoded_train_sequences),
            _pad(encoded_train_labels),
            _pad(encoded_dev_sequences),
            _pad(encoded_dev_labels),
            text_tokenizer,
            label_tokenizer)

# NOTE: this rebinds train_sentences and dev_sentences, which
# encode_pad_data reads as its raw inputs, to the padded arrays it returns.
# Re-running encode_pad_data (or unique_words) after this cell therefore
# operates on the padded arrays rather than on the original sentences.
(train_sentences, 
 train_labels, 
 dev_sentences, 
 dev_labels,
 text_tokenizer,
 label_tokenizer) = encode_pad_data()

In [None]:
# Embedding width, TensorFlow random seed and the Adam optimizer
# (0.005 is passed as Adam's first positional argument).
output_dim = 50
tf.random.set_seed(42)
opt = Adam(0.005)

# Integer token ids of length maxlen -> trainable embedding with
# mask_zero=True (index 0 is treated as padding by the mask).
sequence_input = Input(shape=(maxlen,), dtype=tf.int32, name='sequence_input')
outputs = Embedding(input_dim=num_tokens+1, output_dim=output_dim, trainable=True, mask_zero=True)(sequence_input)
# Bidirectional LSTM (both directions concatenated) followed by a second,
# unidirectional LSTM; both return the full sequence so that every
# position keeps its own output.
outputs = Bidirectional(LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat')(outputs)
outputs = LSTM(units=64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(outputs)
# Per-timestep dense projection with ReLU.
outputs = (TimeDistributed(Dense(64, activation="relu")))(outputs)

# Softmax over 4 classes per position.
# NOTE(review): presumably padding index 0 plus the three tags - confirm
# against label_tokenizer.
outputs = Dense(4, activation="softmax")(outputs)

# Integer (sparse) targets are expected by the chosen loss.
lstm_model = Model(inputs=sequence_input, outputs=outputs)
lstm_model.compile(loss = 'SparseCategoricalCrossentropy', optimizer=opt)
lstm_model.summary()

In [None]:
import warnings
from sklearn.metrics import f1_score, precision_score, recall_score

warnings.filterwarnings('ignore')
"""Warnings are ignored because precision and recall warn whenever
not all of the true labels are represented in the predictions."""


def _drop_excluded(y_true, y_pred, excluded_tags):
    """Return the true/predicted pairs whose true tag is not excluded, as two lists."""
    kept_true, kept_pred = [], []
    for y_t, y_p in zip(y_true, y_pred):
        if y_t not in excluded_tags:
            kept_true.append(y_t)
            kept_pred.append(y_p)
    return kept_true, kept_pred


def exclude_from_f1(y_true, y_pred, excluded_tags=(0,)):
    """Weighted F1 over the positions whose true tag is not in ``excluded_tags``.

    Args:
        y_true: Iterable of true tags.
        y_pred: Iterable of predicted tags, aligned with ``y_true``.
        excluded_tags: Container of true-tag values to leave out of the
            score. Defaults to ``(0,)``.

    Returns:
        The value of ``f1_score(..., average='weighted')`` on the kept pairs.
    """
    ytrue, yhat = _drop_excluded(y_true, y_pred, excluded_tags)
    return f1_score(ytrue, yhat, average='weighted')


def exclude_from_precision(y_true, y_pred, excluded_tags=(0,)):
    """Weighted precision over the positions whose true tag is not in ``excluded_tags``.

    Args:
        y_true: Iterable of true tags.
        y_pred: Iterable of predicted tags, aligned with ``y_true``.
        excluded_tags: Container of true-tag values to leave out of the
            score. Defaults to ``(0,)``.

    Returns:
        The value of ``precision_score(..., average='weighted')`` on the
        kept pairs.
    """
    ytrue, yhat = _drop_excluded(y_true, y_pred, excluded_tags)
    return precision_score(ytrue, yhat, average='weighted')


def exclude_from_recall(y_true, y_pred, excluded_tags=(0,)):
    """Weighted recall over the positions whose true tag is not in ``excluded_tags``.

    Args:
        y_true: Iterable of true tags.
        y_pred: Iterable of predicted tags, aligned with ``y_true``.
        excluded_tags: Container of true-tag values to leave out of the
            score. Defaults to ``(0,)``.

    Returns:
        The value of ``recall_score(..., average='weighted')`` on the kept
        pairs.
    """
    ytrue, yhat = _drop_excluded(y_true, y_pred, excluded_tags)
    return recall_score(ytrue, yhat, average='weighted')

In [None]:
# Per-epoch metric histories. Only the val_*_epochs lists are filled by the
# loop below; the code that would fill the others is commented out.
train_f1_epochs, train_precision_epochs, train_recall_epochs = [], [], []
val_f1_epochs, val_precision_epochs, val_recall_epochs = [], [], []
train_f1_no_other_epochs, train_precision_no_other_epochs, train_recall_no_other_epochs = [], [], []
val_f1_no_other_epochs, val_precision_no_other_epochs, val_recall_no_other_epochs = [], [], []

# Ten epochs: one train_on_batch call per (x, y) pair, then validation.
for epoch in range(1, 11):
    print('epoch ', epoch)
    # NOTE(review): x and y are whatever iterating train_sentences /
    # train_labels yields; if those are 2-D padded arrays, each x is a single
    # row without a batch axis - confirm the model accepts that shape.
    for x, y in zip(train_sentences, train_labels):
        #weights = get_weights(y)
        lstm_model.train_on_batch(x, y)#, class_weight=weights)
    
    #train_f1, train_precision, train_recall = [], [], []
    #train_f1_no_other, train_precision_no_other, train_recall_no_other = [], [], []
    #for x, y in zip(train_umls_text, train_labels):
    #    y_pred = np.argmax(lstm_model.predict(x), axis=-1)
    #    train_f1.append(exclude_from_f1(y, y_pred, [0]))
    #    train_precision.append(exclude_from_precision(y, y_pred, [0]))
    #    train_recall.append(exclude_from_recall(y, y_pred, [0]))
    #    train_f1_no_other.append(exclude_from_f1(y, y_pred, [0, 1]))
    #    train_precision_no_other.append(exclude_from_precision(y, y_pred, [0, 1]))
    #    train_recall_no_other.append(exclude_from_recall(y, y_pred, [0, 1]))
        
    #train_f1_epochs.append(np.mean(train_f1))
    #train_precision_epochs.append(np.mean(train_precision))
    #train_recall_epochs.append(np.mean(train_recall))
    #train_f1_no_other_epochs.append(np.mean(train_f1_no_other))
    #train_precision_no_other_epochs.append(np.mean(train_precision_no_other))
    #train_recall_no_other_epochs.append(np.mean(train_recall_no_other))
    
    val_f1_no_other, val_precision_no_other, val_recall_no_other = [], [], []
    val_f1, val_precision, val_recall = [], [], []
    # Validation uses only the first 100 dev items. Metrics skip positions
    # whose true tag is 0 (presumably padding - confirm).
    for x, y in zip(dev_sentences[:100], dev_labels[:100]):
        # Predicted class index per position = argmax over the last axis.
        y_pred = np.argmax(lstm_model.predict(x), axis=-1)
        val_f1.append(exclude_from_f1(y, y_pred, [0]))
        val_precision.append(exclude_from_precision(y, y_pred, [0]))
        val_recall.append(exclude_from_recall(y, y_pred, [0]))
        #val_f1_no_other.append(exclude_from_f1(y, y_pred, [0, 1]))
        #val_precision_no_other.append(exclude_from_precision(y, y_pred, [0, 1]))
        #val_recall_no_other.append(exclude_from_recall(y, y_pred, [0, 1]))
    
    # Record and print the mean of the per-item scores for this epoch.
    val_f1_epochs.append(np.mean(val_f1))
    val_precision_epochs.append(np.mean(val_precision))
    val_recall_epochs.append(np.mean(val_recall))
    #val_f1_no_other_epochs.append(np.mean(val_f1_no_other))
    #val_precision_no_other_epochs.append(np.mean(val_precision_no_other))
    #val_recall_no_other_epochs.append(np.mean(val_recall_no_other))
    
    print(np.mean(val_f1), np.mean(val_precision), np.mean(val_recall))
    #print(np.mean(val_f1_no_other), np.mean(val_precision_no_other), np.mean(val_recall_no_other))