In [None]:
!pip install -U transformers datasets accelerate evaluate seqeval

In [3]:
import os
import sys 
import random
import numpy as np
from dataclasses import dataclass
from typing import List, Dict, Any

import torch
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from evaluate import load as load_metric
from datasets import load_from_disk


In [4]:

# Notebook-wide configuration: data root, base checkpoint and output root.
DATA_DIR = "../data"
MODEL_NAME = "bert-base-cased"
OUTPUT_DIR = "../outputs"

# Put the parent directory on the import path (presumably so project-local
# modules can be imported). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry each time.
_PROJECT_ROOT = os.path.abspath("..")
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

#### Load dataset

In [5]:

# Load the preprocessed dataset from disk. The path is derived from DATA_DIR
# (instead of being hard-coded) so that relocating the data directory only
# requires editing the configuration cell.
data = load_from_disk(os.path.join(DATA_DIR, "processed", "bio_ner_bc5cdr"))

In [6]:

# Read the label vocabulary (one label per line of labels.txt) and build the
# two lookup tables between integer ids and label strings.
# The file is opened in a context manager so its handle is closed as soon as
# the labels have been read, instead of being left open.
with open(os.path.join(DATA_DIR, "labels.txt"), encoding="utf-8") as labels_file:
    label_list: List[str] = [line.strip() for line in labels_file]

# The position of a label in the file is its integer id.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in enumerate(label_list)}

In [7]:

# Dataset used for training and evaluation, read from the same on-disk
# location as `data`. The path is derived from DATA_DIR rather than hard-coded.
tokenized_datasets = load_from_disk(os.path.join(DATA_DIR, "processed", "bio_ner_bc5cdr"))

# Fail fast if a split that later cells index into is absent.
_missing_splits = {"train", "validation", "test"} - set(tokenized_datasets.keys())
assert not _missing_splits, f"dataset is missing expected splits: {sorted(_missing_splits)}"


In [8]:

# Instantiate the pretrained checkpoint with a token-classification head.
# The two label maps are handed over so the model config carries the
# id <-> label-string correspondence.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Data collator (dynamic padding for token classification)

In [9]:
# Tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 
# Collator that batches examples for token classification using that tokenizer
# (dynamic padding; see the repr displayed in the next cell for its settings).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [10]:
# Display the collator's repr to inspect its tokenizer and padding settings.
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, r

In [34]:
# Load the "seqeval" metric through the `evaluate` library; compute_metrics
# calls its .compute() method.
seqeval = load_metric("seqeval")

def compute_metrics(eval_pred):
    """Score one evaluation pass with seqeval.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``. The class
            scores are taken to lie on the last axis of ``logits``; positions
            whose label equals ``-100`` are excluded from scoring.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, filled from seqeval's ``overall_*`` results.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    reference_tags, predicted_tags = [], []
    for predicted_row, label_row in zip(predicted_ids, labels):
        # Keep only the positions whose gold label is not the ignore index.
        kept_pairs = [
            (gold_id, pred_id)
            for pred_id, gold_id in zip(predicted_row, label_row)
            if gold_id != -100
        ]
        # Translate the surviving ids into label strings for seqeval.
        reference_tags.append([id2label[gold_id] for gold_id, _ in kept_pairs])
        predicted_tags.append([id2label[pred_id] for _, pred_id in kept_pairs])

    scores = seqeval.compute(predictions=predicted_tags, references=reference_tags)

    # Report only the aggregate numbers.
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary


In [39]:

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the best validation F1 when training ends.
args = TrainingArguments(
    output_dir=f"{OUTPUT_DIR}/biobert-ner-checkpoints",  # checkpoints of the fine-tuned NER model
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # --- evaluation and checkpoint selection ---
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,  # a higher F1 means a better checkpoint
    # --- misc ---
    report_to="none",  # turn off all experiment-tracking integrations
    seed=42,
)

In [40]:
# Assemble the Trainer from the pieces built above. `compute_metrics` is the
# function defined in this notebook, and the tokenizer is supplied through
# the `processing_class` argument.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations without improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [23]:
# Display the dataset summary (splits, columns and row counts).
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4648
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4657
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4861
    })
})

In [41]:
%%time
# Run fine-tuning. %%time (which must stay on the first line of the cell)
# reports the CPU and wall-clock time the run took.
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.114875,0.775105,0.781755,0.778416,0.959687
2,0.121600,0.12163,0.79094,0.823077,0.806689,0.960597
3,0.121600,0.123983,0.834034,0.841026,0.837515,0.966783
4,0.042400,0.140051,0.826364,0.857495,0.841642,0.966978
5,0.042400,0.144164,0.835845,0.858679,0.847108,0.967433




CPU times: total: 17h 40min 19s
Wall time: 11h 40min 41s


TrainOutput(global_step=1455, training_loss=0.06294214766459777, metrics={'train_runtime': 42040.4194, 'train_samples_per_second': 0.553, 'train_steps_per_second': 0.035, 'total_flos': 965983150460640.0, 'train_loss': 0.06294214766459777, 'epoch': 5.0})

In [48]:
# Persist the trained model to a fixed directory under OUTPUT_DIR.
# NOTE(review): the directory name says "biobert" while MODEL_NAME is
# "bert-base-cased" - confirm the naming is intended.
trainer.save_model(f"{OUTPUT_DIR}/models/biobert_ner_baseline_v1")

In [45]:
%%time
# Evaluate on the held-out "test" split and keep the returned metrics dict.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
# The result is stored in `test_metrics` and displayed in the next cell.

CPU times: total: 38min 29s
Wall time: 7min 26s


In [46]:
# Display the metrics computed on the test split.
test_metrics

{'eval_loss': 0.1530819982290268,
 'eval_precision': 0.829612330686595,
 'eval_recall': 0.8598954298993029,
 'eval_f1': 0.8444824799125183,
 'eval_accuracy': 0.9656873413130214,
 'eval_runtime': 446.0397,
 'eval_samples_per_second': 10.898,
 'eval_steps_per_second': 0.682,
 'epoch': 5.0}