In [None]:
!pip install -U transformers datasets accelerate evaluate seqeval

In [None]:
import os
import random
import sys
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import torch
from datasets import Dataset, DatasetDict
from datasets import load_dataset
from datasets import load_from_disk
from evaluate import load as load_metric
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)

In [None]:

# Project paths and the pretrained checkpoint that gets fine-tuned.
DATA_DIR = "../data"
MODEL_NAME = "dmis-lab/biobert-base-cased-v1.1"
OUTPUT_DIR = "../outputs"

# Seed every RNG before anything random happens. The classification head is
# newly initialised when the model is instantiated, so without this its
# starting weights (and therefore the results) change on every kernel restart.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Make the parent directory importable so project modules can be imported.
sys.path.append(os.path.abspath(".."))


#### Load dataset

In [None]:
# Load the preprocessed dataset that was saved to disk under
# DATA_DIR/processed. The path is derived from DATA_DIR so that changing the
# config constant is enough to relocate the data.
data = load_from_disk(f"{DATA_DIR}/processed/bio_ner_bc5cdr")

In [None]:

# Read the tag vocabulary, one label per line; the line order defines the
# integer id of each label.
labels_path = os.path.join(DATA_DIR, "labels.txt")
with open(labels_path, encoding="utf-8") as labels_file:
    # The context manager closes the file handle deterministically.
    # Blank lines are skipped so a stray empty line cannot become a bogus "" label.
    label_list: List[str] = [line.strip() for line in labels_file if line.strip()]

# Bidirectional lookups between integer ids and label strings.
id2label = {i: l for i, l in enumerate(label_list)}
label2id = {l: i for i, l in enumerate(label_list)}

In [None]:
# Load the splits used for training and evaluation. The saved dataset already
# carries input_ids / attention_mask / labels columns, so no tokenization step
# is run in this notebook. The path is derived from DATA_DIR rather than
# repeating the literal.
tokenized_datasets = load_from_disk(f"{DATA_DIR}/processed/bio_ner_bc5cdr")


In [None]:

# Instantiate the pretrained encoder with a token-classification head and
# attach both label mappings. The classifier weights are not part of the
# checkpoint and are newly initialised.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    label2id=label2id,
    id2label=id2label,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Data collator (dynamic padding for token classification)

In [None]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 
# Collator that batches examples for token classification, padding with the
# tokenizer's settings.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Display the collator's configuration as a quick sanity check.
data_collator

DataCollatorForTokenClassification(tokenizer=BertTokenizerFast(name_or_path='dmis-lab/biobert-base-cased-v1.1', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_

#### Metrics

In [27]:
# Entity-level sequence-labelling metric, loaded once and reused on every evaluation.
seqeval = load_metric("seqeval")


def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into overall seqeval scores.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            position is the argmax of ``logits`` over the last axis.
            Positions whose label equals ``-100`` are ignored.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from seqeval's ``overall_*`` results.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    gold_tags, predicted_tags = [], []
    for pred_row, label_row in zip(predicted_ids, labels):
        # Keep only the positions that carry a real label, then map the
        # surviving ids to their label strings.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        gold_tags.append([id2label[l] for _, l in kept])
        predicted_tags.append([id2label[p] for p, _ in kept])

    scores = seqeval.compute(predictions=predicted_tags, references=gold_tags)

    # Report only the aggregate metrics.
    summary = {}
    summary["precision"] = scores["overall_precision"]
    summary["recall"] = scores["overall_recall"]
    summary["f1"] = scores["overall_f1"]
    summary["accuracy"] = scores["overall_accuracy"]
    return summary


In [28]:


# Training configuration: evaluate and checkpoint once per epoch, and keep
# the checkpoint with the highest validation F1.
args = TrainingArguments(
    # Where checkpoints of the fine-tuned NER model are written.
    output_dir=OUTPUT_DIR + "/bert-ner-checkpoints",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    seed=42,
    # Evaluation, checkpointing and best-model selection.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,  # a higher F1 is better
    # Disable MLflow/W&B logging.
    report_to="none",
)

In [29]:

# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    args=args,
    model=model,
    # The tokenizer is passed as processing_class rather than via tokenizer=.
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    # Stop early if the validation metric does not improve (patience of 2).
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
# Display the available splits, their columns and row counts.
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4648
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4657
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4861
    })
})

In [None]:
%%time
# Run fine-tuning with the Trainer configured above; %%time reports how long
# the cell takes.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.0967,0.830412,0.87357,0.851444,0.969178
2,0.043200,0.121874,0.837147,0.891223,0.863339,0.968235
3,0.043200,0.117825,0.879627,0.883531,0.881574,0.973479
4,0.013200,0.127589,0.870946,0.895168,0.882891,0.973728
5,0.013200,0.133697,0.874121,0.895069,0.884471,0.973955


In [17]:
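# Optional: resume an interrupted run from a saved checkpoint instead of
# starting over. Uncomment and point ckpt_path at the checkpoint to resume from.
# ckpt_path = OUTPUT_DIR + "/bert-ner-checkpoints/checkpoint-582"
# trainer.train(resume_from_checkpoint=ckpt_path)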

In [None]:
# Persist the trained model under OUTPUT_DIR/models as the v1 baseline.
trainer.save_model(OUTPUT_DIR + "/models/bert_ner_baseline_v1")

In [42]:
%%time
# Evaluate on the held-out test split; the result is the metrics dict
# returned by trainer.evaluate.
test_metrics = trainer.evaluate(eval_dataset=tokenized_datasets["test"])




CPU times: total: 55min 12s
Wall time: 9min 46s


In [None]:
# Show the test-split metrics. The evaluation result is stored in
# `test_metrics`; the previous name `test_results` is never defined and
# raises NameError on a fresh kernel.
print("Test:", test_metrics)