In [1]:
from __future__ import division, print_function, unicode_literals
import sys
from collections import defaultdict

from tqdm import tqdm
import numpy as np
import pandas as pd
import joblib

import transformers
from transformers import AutoModel, AutoTokenizer

import torch
import torch.nn as nn
import torch.utils.data
from torchcrf import CRF

from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import classification_report

In [2]:
data_folder = '/media/lurker18/Local Disk/BioNER-Abbrev/Dataset/NCBI'

In [3]:
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"

pubmedbert = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = False)
pubmedbert.save_pretrained(f"./pretrained_model/pubmedbert")
tokenizer.save_pretrained("./pretrained_model/pubmedbert")

[2023-12-05 14:01:51,092] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


('./pretrained_model/pubmedbert/tokenizer_config.json',
 './pretrained_model/pubmedbert/special_tokens_map.json',
 './pretrained_model/pubmedbert/vocab.txt',
 './pretrained_model/pubmedbert/added_tokens.json')

In [4]:
# Hyperparameter Settings
device = "cuda:0"
max_len = 256
train_batch_size = 64
valid_batch_size = 64
epochs = 30
num_workers = 8
BASE_MODEL_PATH = './pretrained_model/pubmedbert'
model_path = "models/NBCI_pubmedbert"
TOKENIZER = transformers.BertTokenizer.from_pretrained(BASE_MODEL_PATH,
                                                       do_lower_case = True)

In [5]:
class EntityDataset:
    def __init__(self, texts, tags,enc_tag):
        self.texts = texts
        self.tags = tags
        self.enc_tag = enc_tag
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        text = self.texts[item]
        tags = self.tags[item]

        ids = []
        target_tag =[]

        for i, s in enumerate(text):
            inputs = TOKENIZER.encode(
                str(s),
                add_special_tokens = False
            )
            input_len = len(inputs)
            ids.extend(inputs)
            target_tag.extend([tags[i]] * input_len)

        ids = ids[:max_len - 2]
        target_tag = target_tag[:max_len - 2]

        ids = [102] + ids + [103]
        o_tag = self.enc_tag.transform(["O"])[0]
        target_tag = [o_tag] + target_tag + [o_tag]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        padding_len = max_len - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_tag = target_tag + ([0] * padding_len)

        return {
            "ids": torch.tensor(ids, dtype = torch.long),
            "mask": torch.tensor(mask, dtype = torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

In [6]:

class EntityModel(nn.Module):
    def __init__(self, num_tag):
        super(EntityModel, self).__init__()
        self.num_tag = num_tag
        self.bert = transformers.BertModel.from_pretrained(BASE_MODEL_PATH, return_dict = False)
        self.bilstm =  nn.LSTM(768, 1024 // 2, num_layers = 1, bidirectional = True, batch_first = True)

        self.dropout_tag = nn.Dropout(0.3)
        
        self.hidden2tag_tag = nn.Linear(1024, self.num_tag)

        self.crf_tag = CRF(self.num_tag, batch_first = True)
    
    
    # return the loss only, not encode the tag
    def forward(self, ids, mask, token_type_ids, target_tag):
        x, _ = self.bert(ids, attention_mask = mask, token_type_ids = token_type_ids)
        h, _ = self.bilstm(x)

        o_tag = self.dropout_tag(h)
        tag = self.hidden2tag_tag(o_tag)
        mask = torch.where(mask == 1, True, False)

        loss_tag = - self.crf_tag(tag, 
                                  target_tag, 
                                  mask = mask, 
                                  reduction = 'token_mean')
        loss = loss_tag
        
        return loss


    # encode the tag, dont return loss
    def encode(self, ids, mask, token_type_ids, target_tag):
        # Bert - BiLSTM
        x, _ = self.bert(ids, 
                         attention_mask = mask, 
                         token_type_ids = token_type_ids)
        h, _ = self.bilstm(x)

        # drop out
        o_tag = self.dropout_tag(h)
        # o_pos = self.dropout_pos(h)

        # Hidden2Tag (Linear)
        tag = self.hidden2tag_tag(o_tag)
        mask = torch.where(mask == 1, True, False)
        tag = self.crf_tag.decode(tag, mask = mask)

        return tag

In [7]:
total_tags = []
with open(data_folder + "/classes.txt") as f:
    for line in f.readlines():
        total_tags.append(line.strip())
        

enc_tag = preprocessing.LabelEncoder()
enc_tag.fit(list(total_tags))

def process_data(data_path):
    sentences, tags = [], []
    sentence, tag = [], []
    
    total_tags = set()
    i = 0
    
    for path in data_path:
        with open(path, "r") as f:
            for line in f:
                if i % 10000 == 0:
                    print(len(sentences))
                
                i += 1
                line = line.strip()
                if line.startswith("-DOCSTART-"):
                    continue
                elif len(line) == 0:
                    if sentence == [] and tag == []:
                        continue
                        
                    sentences.append(sentence)
                    tags.append(tag)
                    sentence, tag = [], []
                else:
                    s,t = line.split("\t")
                    sentence.append(s)
                    tag.append(t)
                    
    for i in range(len(tags)):
        tags[i] = enc_tag.transform(tags[i])
        
    return sentences, tags, enc_tag

In [8]:
# Annotation Tagging 

def split_tag(chunk_tag):
    """
    split chunk tag into IOBES prefix and chunk_type
    e.g.
    B-PER --> (B, PER)
    O --> (0, None)
    """
    if chunk_tag == 'O':
        return ('O', None)
    return chunk_tag.split("-", maxsplit = 1)

def is_chunk_end(prev_tag, tag):
    """
    check if the previous chunk ended between the previous and current word
    e.g.
    (B-PER, I-PER) --> False
    (B-LOC, O) --> True
    Note: in case of contradicting tags, e.g. (B-PER, I-LOC)
    this is considered as (B-PER, B-LOC)
    """
    
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)
    
    if prefix1 == 'O':
        return False
    if prefix2 == 'O':
        return prefix1 != 'O'
    
    if chunk_type1 != chunk_type2:
        return True
    
    return prefix2 in ["B", "S"] or prefix1 in ["E", "S"]

def is_chunk_start(prev_tag, tag):
    """
    check if a new chunk started between the previous and current word
    """
    prefix1, chunk_type1 = split_tag(prev_tag)
    prefix2, chunk_type2 = split_tag(tag)
    
    if prefix2 == "O":
        return False
    if prefix1 == "O":
        return prefix2 != "O"
    
    if chunk_type1 != chunk_type2:
        return True
    
    return prefix2 in ["B", "S"] or prefix1 in ["E", "S"]

In [9]:
def calc_metrics(tp, p, t, percent = True):
    """
    compute overall precision, recall and F1-Score (default values are 0.0)
    if percent is True, return 100 * original decimal value
    """
    precision = tp / p if p else 0
    recall = tp / t if t else 0
    fb1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    if percent:
        return 100 * precision, 100 * recall, 100 * fb1
    else:
        return precision, recall, fb1

In [10]:
def count_chunks(true_seqs, pred_seqs):
    
    """
    true_seqs: a list of true tags
    pred_seqs: a list of predicted tags
    return: 
    correct_chunks: a dict (counter), 
                    key = chunk types, 
                    value = number of correctly identified chunks per type
    true_chunks:    a dict, number of true chunks per type
    pred_chunks:    a dict, number of identified chunks per type
    correct_counts, true_counts, pred_counts: similar to above, but for tags
    """
    
    correct_chunks = defaultdict(int)
    true_chunks = defaultdict(int)
    pred_chunks = defaultdict(int)

    correct_counts = defaultdict(int)
    true_counts = defaultdict(int)
    pred_counts = defaultdict(int)

    prev_true_tag, prev_pred_tag = 'O', 'O'
    correct_chunk = None

    for true_tag, pred_tag in zip(true_seqs, pred_seqs):
        if true_tag == pred_tag:
            correct_counts[true_tag] += 1
        true_counts[true_tag] += 1
        pred_counts[pred_tag] += 1

        _, true_type = split_tag(true_tag)
        _, pred_type = split_tag(pred_tag)

        if correct_chunk is not None:
            true_end = is_chunk_end(prev_true_tag, true_tag)
            pred_end = is_chunk_end(prev_pred_tag, pred_tag)

            if pred_end and true_end:
                correct_chunks[correct_chunk] += 1
                correct_chunk = None
            elif pred_end != true_end or true_type != pred_type:
                correct_chunk = None

        true_start = is_chunk_start(prev_true_tag, true_tag)
        pred_start = is_chunk_start(prev_pred_tag, pred_tag)

        if true_start and pred_start and true_type == pred_type:
            correct_chunk = true_type
        if true_start:
            true_chunks[true_type] += 1
        if pred_start:
            pred_chunks[pred_type] += 1

        prev_true_tag, prev_pred_tag = true_tag, pred_tag
    if correct_chunk is not None:
        correct_chunks[correct_chunk] += 1

    return (correct_chunks, true_chunks, pred_chunks, 
        correct_counts, true_counts, pred_counts)

In [11]:
# Print the results
def get_result(correct_chunks, true_chunks, pred_chunks,
    correct_counts, true_counts, pred_counts, verbose = True):
    
    """
    if verbose, print overall performance, as well as preformance per chunk type;
    otherwise, simply return overall prec, rec, f1 scores
    """
    
    # sum counts
    sum_correct_chunks = sum(correct_chunks.values())
    sum_true_chunks = sum(true_chunks.values())
    sum_pred_chunks = sum(pred_chunks.values())

    sum_correct_counts = sum(correct_counts.values())
    sum_true_counts = sum(true_counts.values())

    nonO_correct_counts = sum(v for k, v in correct_counts.items() if k != 'O')
    nonO_true_counts = sum(v for k, v in true_counts.items() if k != 'O')

    chunk_types = sorted(list(set(list(true_chunks) + list(pred_chunks))))

    # compute overall precision, recall and F1-Score (default values are 0.0)
    prec, rec, f1 = calc_metrics(sum_correct_chunks, sum_pred_chunks, sum_true_chunks)
    res = (prec, rec, f1)
    if not verbose:
        return res

    # print overall performance, and performance per chunk type
    
    print("processed %i tokens with %i phrases; " % (sum_true_counts, sum_true_chunks), end='')
    print("found: %i phrases; correct: %i.\n" % (sum_pred_chunks, sum_correct_chunks), end='')
        
    print("accuracy: %6.2f%%; (non-O)" % (100 * nonO_correct_counts/nonO_true_counts))
    print("accuracy: %6.2f%%; " % (100 * sum_correct_counts/sum_true_counts), end = '')
    print("precision: %6.2f%%; recall: %6.2f%%; F1-Score: %6.2f" % (prec, rec, f1))

    # for each chunk type, compute precision, recall and FB1 (default values are 0.0)
    for t in chunk_types:
        prec, rec, f1 = calc_metrics(correct_chunks[t], pred_chunks[t], true_chunks[t])
        print("%17s: " %t , end = '')
        print("precision: %6.2f%%; recall: %6.2f%%; F1-Score: %6.2f" % (prec, rec, f1), end = '')
        print("  %d" % pred_chunks[t])

    return res

In [12]:
# Evaluate the datasets
def evaluate(true_seqs, pred_seqs, verbose = True):
    (correct_chunks, true_chunks, pred_chunks, correct_counts, true_counts, pred_counts) = count_chunks(true_seqs, pred_seqs)
    result = get_result(correct_chunks, true_chunks, pred_chunks, correct_counts, true_counts, pred_counts, verbose = verbose)
    return result

In [13]:
# Set the train / validation / test set for each loss
def train_fn(data_loader, model, optimizer, device):
    model.train()
    final_loss = 0
    for data in tqdm(data_loader, total = len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
        loss = model(**data)
        loss.backward()
        optimizer.step()
        final_loss += loss.item()
    return final_loss / len(data_loader)

def eval_fn(data_loader, model, device):
    model.eval()
    final_loss = 0
    
    for data in tqdm(data_loader, total = len(data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        loss = model(**data)
        final_loss += loss.item()
    return final_loss / len(data_loader)


def test_fn(dataset,model,device,enc_tag):
    final_test = []
    final_pred = []
    O = enc_tag.transform(["O"])[0]
  
    with torch.no_grad():
        for data in tqdm(dataset):
            for k, v in data.items():
                data[k] = v.to(device).unsqueeze(0)

            tag = model.encode(**data)
            padded_pred = tag[0]
            test = data["target_tag"].cpu()[0][:len(padded_pred)]
            test = enc_tag.inverse_transform(test)
            padded_pred = enc_tag.inverse_transform(padded_pred)
            final_pred.extend(padded_pred[1:-1])
            final_test.extend(test[1:-1])
  
    print(evaluate(final_test, final_pred))

In [14]:
# Load saved model
def load_model(epochs):
    path = model_path + f"_{epochs}.bin"
    device = torch.device(device)
    model = EntityModel(num_tag = num_tag)
    model.load_state_dict(torch.load(path))
    model.to(device)
    return model

In [15]:
# Load the NCBI Disease
if __name__ == "__main__":
    sentences, tag, enc_tag = process_data([data_folder + '/NCBI-disease-IOB/train.tsv',
                                            data_folder + '/NCBI-disease-IOB/dev.tsv'])
    test_sentences, test_tag, _ = process_data([data_folder + '/NCBI-disease-IOB/test.tsv'])

    meta_data = {
        "enc_tag": enc_tag
    }

    joblib.dump(meta_data, "meta.bin")

    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        valid_sentences,
        train_tag,
        valid_tag
    ) = model_selection.train_test_split(sentences, 
                                         tag, 
                                         random_state = 42, 
                                         test_size = 0.1)

    train_dataset = EntityDataset(texts = train_sentences, 
                                  tags = train_tag,
                                  enc_tag = enc_tag)

    train_data_loader = torch.utils.data.DataLoader(train_dataset, 
                                                    batch_size = train_batch_size, 
                                                    num_workers = num_workers)

    valid_dataset = EntityDataset(texts = valid_sentences, 
                                  tags = valid_tag,
                                  enc_tag = enc_tag)

    valid_data_loader = torch.utils.data.DataLoader(valid_dataset, 
                                                    batch_size = valid_batch_size, 
                                                    num_workers = num_workers)

    test_dataset = EntityDataset(texts = test_sentences,
                                 tags = test_tag, 
                                 enc_tag = enc_tag)

    device = torch.device(device)
    model = EntityModel(num_tag = num_tag)
    model.to(device)

    # optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, weight_decay=0.001)
    optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)


0
385
766
1146
1532
1907
2300
2690
3083
3459
3839
4249
4631
5007
5383
5766
6126
0
369
734


In [16]:
# Run the model!
for epoch in range(epochs):
    train_loss = train_fn(train_data_loader, model, optimizer, device)
    torch.cuda.empty_cache()
    valid_loss = eval_fn(valid_data_loader, model, device)
    torch.cuda.empty_cache()
    print(f"Train Loss = {train_loss}")
    print(f"Validation loss = {valid_loss}")
    test_fn(test_dataset, model, device, enc_tag)
    torch.save(model.state_dict(), model_path + f"_{epoch}.bin")

100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.40it/s]


Train Loss = 0.19001987568206258
Validation loss = 0.10474997013807297


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 64.11it/s]


processed 26587 tokens with 1361 phrases; found: 1376 phrases; correct: 624.
accuracy:  74.32%; (non-O)
accuracy:  95.64%; precision:  45.35%; recall:  45.85%; F1-Score:  45.60
          Disease: precision:  45.35%; recall:  45.85%; F1-Score:  45.60  1376
(45.348837209302324, 45.8486407053637, 45.59736938253562)


100%|███████████████████████████████████████████| 90/90 [01:06<00:00,  1.34it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.08it/s]


Train Loss = 0.09815076076322132
Validation loss = 0.07882284633815288


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 64.26it/s]


processed 26587 tokens with 1361 phrases; found: 1455 phrases; correct: 737.
accuracy:  77.46%; (non-O)
accuracy:  96.32%; precision:  50.65%; recall:  54.15%; F1-Score:  52.34
          Disease: precision:  50.65%; recall:  54.15%; F1-Score:  52.34  1455
(50.65292096219931, 54.15135929463629, 52.34375)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.23it/s]


Train Loss = 0.06868906937953499
Validation loss = 0.06027600765228271


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 61.93it/s]


processed 26587 tokens with 1361 phrases; found: 1375 phrases; correct: 1030.
accuracy:  86.29%; (non-O)
accuracy:  97.53%; precision:  74.91%; recall:  75.68%; F1-Score:  75.29
          Disease: precision:  74.91%; recall:  75.68%; F1-Score:  75.29  1375
(74.90909090909092, 75.67964731814843, 75.29239766081874)


100%|███████████████████████████████████████████| 90/90 [01:07<00:00,  1.33it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.34it/s]


Train Loss = 0.055130862527423434
Validation loss = 0.056658771634101865


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.67it/s]


processed 26587 tokens with 1361 phrases; found: 1483 phrases; correct: 1188.
accuracy:  90.95%; (non-O)
accuracy:  97.84%; precision:  80.11%; recall:  87.29%; F1-Score:  83.54
          Disease: precision:  80.11%; recall:  87.29%; F1-Score:  83.54  1483
(80.10788941335132, 87.28875826598089, 83.54430379746836)


100%|███████████████████████████████████████████| 90/90 [01:06<00:00,  1.36it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.39it/s]


Train Loss = 0.04319271598425176
Validation loss = 0.053891933709383014


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.09it/s]


processed 26587 tokens with 1361 phrases; found: 1455 phrases; correct: 1198.
accuracy:  92.42%; (non-O)
accuracy:  97.87%; precision:  82.34%; recall:  88.02%; F1-Score:  85.09
          Disease: precision:  82.34%; recall:  88.02%; F1-Score:  85.09  1455
(82.33676975945016, 88.02351212343865, 85.08522727272727)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.40it/s]


Train Loss = 0.03621464369611608
Validation loss = 0.04939848203212023


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 60.98it/s]


processed 26587 tokens with 1361 phrases; found: 1424 phrases; correct: 1192.
accuracy:  92.01%; (non-O)
accuracy:  97.89%; precision:  83.71%; recall:  87.58%; F1-Score:  85.60
          Disease: precision:  83.71%; recall:  87.58%; F1-Score:  85.60  1424
(83.70786516853933, 87.582659808964, 85.60143626570915)


100%|███████████████████████████████████████████| 90/90 [01:04<00:00,  1.39it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.39it/s]


Train Loss = 0.028283464825815625
Validation loss = 0.05527025479823351


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 58.89it/s]


processed 26587 tokens with 1361 phrases; found: 1425 phrases; correct: 1201.
accuracy:  92.50%; (non-O)
accuracy:  97.83%; precision:  84.28%; recall:  88.24%; F1-Score:  86.22
          Disease: precision:  84.28%; recall:  88.24%; F1-Score:  86.22  1425
(84.28070175438597, 88.24393828067598, 86.2167982770998)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.20it/s]


Train Loss = 0.024908279115334154
Validation loss = 0.0483676765114069


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.41it/s]


processed 26587 tokens with 1361 phrases; found: 1413 phrases; correct: 1200.
accuracy:  92.84%; (non-O)
accuracy:  97.94%; precision:  84.93%; recall:  88.17%; F1-Score:  86.52
          Disease: precision:  84.93%; recall:  88.17%; F1-Score:  86.52  1413
(84.92569002123143, 88.1704628949302, 86.51766402307139)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.44it/s]


Train Loss = 0.025238811172958876
Validation loss = 0.045263377204537394


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.23it/s]


processed 26587 tokens with 1361 phrases; found: 1427 phrases; correct: 1218.
accuracy:  92.46%; (non-O)
accuracy:  98.08%; precision:  85.35%; recall:  89.49%; F1-Score:  87.37
          Disease: precision:  85.35%; recall:  89.49%; F1-Score:  87.37  1427
(85.35388927820603, 89.49301983835414, 87.3744619799139)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.36it/s]


Train Loss = 0.02073134052205003
Validation loss = 0.04592851623892784


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 60.24it/s]


processed 26587 tokens with 1361 phrases; found: 1461 phrases; correct: 1212.
accuracy:  92.01%; (non-O)
accuracy:  98.00%; precision:  82.96%; recall:  89.05%; F1-Score:  85.90
          Disease: precision:  82.96%; recall:  89.05%; F1-Score:  85.90  1461
(82.95687885010267, 89.0521675238795, 85.89652728561305)


100%|███████████████████████████████████████████| 90/90 [01:06<00:00,  1.34it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.21it/s]


Train Loss = 0.019818164475469125
Validation loss = 0.0387733044102788


100%|█████████████████████████████████████████| 939/939 [00:16<00:00, 57.60it/s]


processed 26587 tokens with 1361 phrases; found: 1378 phrases; correct: 1201.
accuracy:  90.42%; (non-O)
accuracy:  98.13%; precision:  87.16%; recall:  88.24%; F1-Score:  87.70
          Disease: precision:  87.16%; recall:  88.24%; F1-Score:  87.70  1378
(87.15529753265602, 88.24393828067598, 87.6962395034684)


100%|███████████████████████████████████████████| 90/90 [01:06<00:00,  1.35it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.14it/s]


Train Loss = 0.019562922190460894
Validation loss = 0.04130301997065544


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 65.50it/s]


processed 26587 tokens with 1361 phrases; found: 1377 phrases; correct: 1214.
accuracy:  91.33%; (non-O)
accuracy:  98.32%; precision:  88.16%; recall:  89.20%; F1-Score:  88.68
          Disease: precision:  88.16%; recall:  89.20%; F1-Score:  88.68  1377
(88.16267247639796, 89.19911829537105, 88.67786705624543)


100%|███████████████████████████████████████████| 90/90 [01:04<00:00,  1.39it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.39it/s]


Train Loss = 0.015764573723491694
Validation loss = 0.04363811798393726


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 64.17it/s]


processed 26587 tokens with 1361 phrases; found: 1377 phrases; correct: 1200.
accuracy:  90.08%; (non-O)
accuracy:  98.16%; precision:  87.15%; recall:  88.17%; F1-Score:  87.66
          Disease: precision:  87.15%; recall:  88.17%; F1-Score:  87.66  1377
(87.14596949891067, 88.1704628949302, 87.65522279035793)


100%|███████████████████████████████████████████| 90/90 [01:07<00:00,  1.33it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.13it/s]


Train Loss = 0.014478918859579911
Validation loss = 0.04906234480440617


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 58.85it/s]


processed 26587 tokens with 1361 phrases; found: 1290 phrases; correct: 1153.
accuracy:  87.73%; (non-O)
accuracy:  98.05%; precision:  89.38%; recall:  84.72%; F1-Score:  86.99
          Disease: precision:  89.38%; recall:  84.72%; F1-Score:  86.99  1290
(89.37984496124031, 84.71711976487877, 86.98604300264053)


100%|███████████████████████████████████████████| 90/90 [01:04<00:00,  1.39it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.32it/s]


Train Loss = 0.011730210212731941
Validation loss = 0.04483432993292809


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 65.05it/s]


processed 26587 tokens with 1361 phrases; found: 1340 phrases; correct: 1163.
accuracy:  89.58%; (non-O)
accuracy:  97.94%; precision:  86.79%; recall:  85.45%; F1-Score:  86.12
          Disease: precision:  86.79%; recall:  85.45%; F1-Score:  86.12  1340
(86.7910447761194, 85.45187362233652, 86.11625323954092)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.32it/s]


Train Loss = 0.011675250202339763
Validation loss = 0.04280924163758755


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 61.49it/s]


processed 26587 tokens with 1361 phrases; found: 1304 phrases; correct: 1148.
accuracy:  89.39%; (non-O)
accuracy:  97.95%; precision:  88.04%; recall:  84.35%; F1-Score:  86.15
          Disease: precision:  88.04%; recall:  84.35%; F1-Score:  86.15  1304
(88.03680981595092, 84.34974283614989, 86.15384615384616)


100%|███████████████████████████████████████████| 90/90 [01:07<00:00,  1.33it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.56it/s]


Train Loss = 0.010639642014737345
Validation loss = 0.041527945920825


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 66.51it/s]


processed 26587 tokens with 1361 phrases; found: 1311 phrases; correct: 1169.
accuracy:  88.64%; (non-O)
accuracy:  98.04%; precision:  89.17%; recall:  85.89%; F1-Score:  87.50
          Disease: precision:  89.17%; recall:  85.89%; F1-Score:  87.50  1311
(89.16857360793287, 85.89272593681116, 87.5)


100%|███████████████████████████████████████████| 90/90 [01:03<00:00,  1.41it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.18it/s]


Train Loss = 0.010424764916792305
Validation loss = 0.04567869510501623


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.64it/s]


processed 26587 tokens with 1361 phrases; found: 1327 phrases; correct: 1187.
accuracy:  90.11%; (non-O)
accuracy:  98.22%; precision:  89.45%; recall:  87.22%; F1-Score:  88.32
          Disease: precision:  89.45%; recall:  87.22%; F1-Score:  88.32  1327
(89.4498869630746, 87.21528288023512, 88.31845238095238)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.39it/s]


Train Loss = 0.008065022650407627
Validation loss = 0.04661700390279293


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.37it/s]


processed 26587 tokens with 1361 phrases; found: 1364 phrases; correct: 1205.
accuracy:  91.89%; (non-O)
accuracy:  98.21%; precision:  88.34%; recall:  88.54%; F1-Score:  88.44
          Disease: precision:  88.34%; recall:  88.54%; F1-Score:  88.44  1364
(88.34310850439883, 88.53783982365907, 88.44036697247707)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.42it/s]


Train Loss = 0.008772147884075012
Validation loss = 0.044757818710058926


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 59.81it/s]


processed 26587 tokens with 1361 phrases; found: 1335 phrases; correct: 1181.
accuracy:  88.07%; (non-O)
accuracy:  98.08%; precision:  88.46%; recall:  86.77%; F1-Score:  87.61
          Disease: precision:  88.46%; recall:  86.77%; F1-Score:  87.61  1335
(88.46441947565543, 86.77443056576047, 87.6112759643917)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.36it/s]


Train Loss = 0.008105124440044164
Validation loss = 0.05043441727757454


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 60.16it/s]


processed 26587 tokens with 1361 phrases; found: 1381 phrases; correct: 1198.
accuracy:  92.12%; (non-O)
accuracy:  98.09%; precision:  86.75%; recall:  88.02%; F1-Score:  87.38
          Disease: precision:  86.75%; recall:  88.02%; F1-Score:  87.38  1381
(86.74873280231716, 88.02351212343865, 87.38147337709701)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.42it/s]


Train Loss = 0.008563163820266102
Validation loss = 0.057709643989801405


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 58.83it/s]


processed 26587 tokens with 1361 phrases; found: 1402 phrases; correct: 1216.
accuracy:  93.71%; (non-O)
accuracy:  98.11%; precision:  86.73%; recall:  89.35%; F1-Score:  88.02
          Disease: precision:  86.73%; recall:  89.35%; F1-Score:  88.02  1402
(86.73323823109843, 89.3460690668626, 88.02026782482807)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.31it/s]


Train Loss = 0.006941575434757397
Validation loss = 0.053111820481717587


100%|█████████████████████████████████████████| 939/939 [00:16<00:00, 57.49it/s]


processed 26587 tokens with 1361 phrases; found: 1340 phrases; correct: 1190.
accuracy:  91.02%; (non-O)
accuracy:  98.10%; precision:  88.81%; recall:  87.44%; F1-Score:  88.12
          Disease: precision:  88.81%; recall:  87.44%; F1-Score:  88.12  1340
(88.80597014925374, 87.43570903747244, 88.11551277304702)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.37it/s]


Train Loss = 0.007455688589511233
Validation loss = 0.04933057855814695


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 66.41it/s]


processed 26587 tokens with 1361 phrases; found: 1338 phrases; correct: 1185.
accuracy:  91.06%; (non-O)
accuracy:  98.18%; precision:  88.57%; recall:  87.07%; F1-Score:  87.81
          Disease: precision:  88.57%; recall:  87.07%; F1-Score:  87.81  1338
(88.56502242152466, 87.06833210874358, 87.8103001111523)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.37it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.47it/s]


Train Loss = 0.00534852343520874
Validation loss = 0.06376697048544884


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 66.23it/s]


processed 26587 tokens with 1361 phrases; found: 1413 phrases; correct: 1223.
accuracy:  93.52%; (non-O)
accuracy:  98.12%; precision:  86.55%; recall:  89.86%; F1-Score:  88.18
          Disease: precision:  86.55%; recall:  89.86%; F1-Score:  88.18  1413
(86.55343241330502, 89.86039676708303, 88.17591925018024)


100%|███████████████████████████████████████████| 90/90 [01:09<00:00,  1.29it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.29it/s]


Train Loss = 0.007158220395083643
Validation loss = 0.058331127278506756


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 64.18it/s]


processed 26587 tokens with 1361 phrases; found: 1404 phrases; correct: 1225.
accuracy:  93.71%; (non-O)
accuracy:  98.16%; precision:  87.25%; recall:  90.01%; F1-Score:  88.61
          Disease: precision:  87.25%; recall:  90.01%; F1-Score:  88.61  1404
(87.25071225071225, 90.00734753857458, 88.60759493670886)


100%|███████████████████████████████████████████| 90/90 [01:05<00:00,  1.38it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.45it/s]


Train Loss = 0.004783049996735321
Validation loss = 0.06822614520788192


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 65.57it/s]


processed 26587 tokens with 1361 phrases; found: 1344 phrases; correct: 1159.
accuracy:  91.78%; (non-O)
accuracy:  97.93%; precision:  86.24%; recall:  85.16%; F1-Score:  85.69
          Disease: precision:  86.24%; recall:  85.16%; F1-Score:  85.69  1344
(86.23511904761905, 85.15797207935341, 85.6931608133087)


100%|███████████████████████████████████████████| 90/90 [01:04<00:00,  1.40it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.41it/s]


Train Loss = 0.004441009366160466
Validation loss = 0.06529605258256196


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 64.92it/s]


processed 26587 tokens with 1361 phrases; found: 1384 phrases; correct: 1215.
accuracy:  93.45%; (non-O)
accuracy:  98.22%; precision:  87.79%; recall:  89.27%; F1-Score:  88.52
          Disease: precision:  87.79%; recall:  89.27%; F1-Score:  88.52  1384
(87.78901734104046, 89.27259368111683, 88.52459016393443)


100%|███████████████████████████████████████████| 90/90 [01:06<00:00,  1.35it/s]
100%|███████████████████████████████████████████| 10/10 [00:03<00:00,  3.15it/s]


Train Loss = 0.004440833105602198
Validation loss = 0.05652193790301681


100%|█████████████████████████████████████████| 939/939 [00:15<00:00, 61.94it/s]


processed 26587 tokens with 1361 phrases; found: 1408 phrases; correct: 1239.
accuracy:  93.37%; (non-O)
accuracy:  98.32%; precision:  88.00%; recall:  91.04%; F1-Score:  89.49
          Disease: precision:  88.00%; recall:  91.04%; F1-Score:  89.49  1408
(87.9971590909091, 91.03600293901543, 89.4907908992416)


100%|███████████████████████████████████████████| 90/90 [01:04<00:00,  1.40it/s]
100%|███████████████████████████████████████████| 10/10 [00:02<00:00,  3.35it/s]


Train Loss = 0.004520943965892204
Validation loss = 0.060908088274300096


100%|█████████████████████████████████████████| 939/939 [00:14<00:00, 64.13it/s]


processed 26587 tokens with 1361 phrases; found: 1407 phrases; correct: 1229.
accuracy:  93.86%; (non-O)
accuracy:  98.21%; precision:  87.35%; recall:  90.30%; F1-Score:  88.80
          Disease: precision:  87.35%; recall:  90.30%; F1-Score:  88.80  1407
(87.34896943852168, 90.30124908155767, 88.80057803468208)
