In [2]:
!pip install datasets evaluate seqeval transformers peft



In [3]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
!pip install -U datasets



# Load dataset
For this NER task, we will use the CoNLL-2003 dataset.

CoNLL-2003 provides BIO-style tags for four entity types (PER, ORG, LOC, MISC), so there are 9 classes: O plus a B- and an I- tag for each entity type.

In [5]:
from datasets import load_dataset

# Download the CoNLL-2003 splits as a DatasetDict.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally — confirm the source is trusted before running.
dataset = load_dataset("conll2003", trust_remote_code=True)

In [6]:
# Show the available splits together with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [7]:
# Keep only the two columns the NER pipeline consumes. `select_columns` drops
# `id`, `pos_tags` and `chunk_tags` without re-processing any example and
# leaves the `ner_tags` feature definition (and its tag names) intact.
dataset = dataset.select_columns(["tokens", "ner_tags"])

In [8]:
# `ner_tags` is a sequence feature; its inner feature exposes the tag names,
# which are indexed by integer label id elsewhere in the notebook.
ner_tag_feature = dataset["train"].features["ner_tags"].feature
label_list = ner_tag_feature.names
print(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [9]:
# Pretrained checkpoint shared by the tokenizer and every model variant in
# this notebook.
model_name = "bert-base-cased"
# Size of the token-classification head: one output per NER tag.
num_labels = len(label_list)

# Data preprocessing

For NER tagging with HuggingFace transformers, it's important to note that the tokenizer splits words into subwords.
So although the dataset already splits the sentences into tokens, the BERT tokenizer will further split those tokens into subword tokens.

With this in mind, the preprocessing function not only has to tokenize the already-split sentences, but also has to add labels to the subwords. We choose a labelling strategy that assigns the label of a word's first subword to all of its following subwords.

In [10]:
from transformers import BertTokenizerFast, BertForTokenClassification

# Tokenizer matching the chosen checkpoint; the label-alignment function in
# this notebook calls `word_ids()` on its output.
tokenizer = BertTokenizerFast.from_pretrained(model_name)

2025-06-01 05:02:30.628646: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748754150.652797      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748754150.659950      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split sentence and expand its word-level tags to sub-tokens.

    Args:
        example: A single dataset row with a ``"tokens"`` list of words and a
            parallel ``"ner_tags"`` list of integer label ids.

    Returns:
        The tokenizer output for the sentence with an added ``"labels"`` list
        holding one entry per sub-token: ``-100`` for tokens that belong to no
        word (``word_ids()`` yields ``None`` for them) and, for every other
        token, the tag of the word it was split from.
    """
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        return_attention_mask=True,
    )
    word_tags = example["ner_tags"]

    # Every sub-token of a word inherits that word's tag, so first and
    # continuation pieces are treated identically.
    # NOTE(review): this repeats a B- tag on each continuation piece of a
    # word — confirm that is the intended scheme for entity-level scoring.
    tokenized["labels"] = [
        -100 if word_id is None else word_tags[word_id]
        for word_id in tokenized.word_ids()
    ]
    return tokenized


In [12]:
# Run the alignment function over every split, one example at a time
# (the function handles a single sentence, so batching stays off).
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=False,
)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

# Set up Training arguments and Trainer

We use a single set of training arguments, one compute-metrics function, and one data collator throughout this task for all of the following models.

In [13]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# One TrainingArguments object is reused by every Trainer in this notebook.
training_args = TrainingArguments(
    # Where checkpoints and TensorBoard logs are written.
    output_dir="./results",
    logging_dir="./logs",
    report_to="tensorboard",
    # Optimisation budget.
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log, evaluate and checkpoint once per epoch.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    # Track validation F1 across checkpoints and reload the best one when
    # training finishes.
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
    # Name of the label field inside each batch.
    label_names=["labels"],
)

# Collator that pads each batch (padding=True) with the help of the tokenizer.
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
)

In [14]:
from evaluate import load
# Load the `seqeval` metric that `compute_metrics` uses to score predictions.
metric = load("seqeval")

def compute_metrics(eval_pred):
    """Turn raw model outputs into seqeval precision / recall / F1 / accuracy.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` must support
            ``argmax(-1)``; ``labels`` holds integer tag ids, with ``-100``
            marking positions that are excluded from scoring.

    Returns:
        A dict with ``eval_precision``, ``eval_recall``, ``eval_f1`` and
        ``eval_accuracy`` (``0`` when the metric reports no overall accuracy).
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(-1)

    predicted_tags = []
    reference_tags = []
    for pred_row, label_row in zip(predicted_ids, labels):
        # Keep only the positions whose reference label is not the -100
        # sentinel, then translate the surviving ids into tag names.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predicted_tags.append([label_list[p] for p, _ in kept])
        reference_tags.append([label_list[l] for _, l in kept])

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        "eval_precision": scores["overall_precision"],
        "eval_recall": scores["overall_recall"],
        "eval_f1": scores["overall_f1"],
        "eval_accuracy": scores.get("overall_accuracy", 0),
    }


In [15]:
from transformers import EarlyStoppingCallback

# Early-stopping callback configured with a patience of 3.
# NOTE(review): this single callback instance is passed to every Trainer in
# the notebook — confirm its internal state does not carry over between runs.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Full finetune

In [16]:
from peft import get_peft_model, TaskType
from transformers import BertForTokenClassification

In [17]:
# BERT encoder from the checkpoint plus a newly initialised classification
# head. The id <-> tag mappings are stored in the config so that the model
# saved later reports tag names instead of generic LABEL_<n> placeholders.
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id={tag: idx for idx, tag in enumerate(label_list)},
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import numpy as np

def count_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object whose ``parameters()`` yields items exposing
            ``numel()`` and ``requires_grad``.

    Prints a single line with the trainable count, the total count and the
    trainable share in percent. A model without any parameters reports
    ``0.0000`` percent instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_params = 0
    # One pass over the parameters feeds both totals.
    for param in model.parameters():
        size = param.numel()
        all_params += size
        if param.requires_grad:
            trainable_params += size

    # Guard the ratio so an empty model does not divide by zero.
    trainable_percentage = 100 * trainable_params / all_params if all_params else 0.0

    print(f"trainable params: {trainable_params:,} || all params: {all_params:,} || trainable%: {trainable_percentage:.4f}")

# Report the parameter budget of the fully fine-tuned model.
count_trainable_parameters(model)

trainable params: 107,726,601 || all params: 107,726,601 || trainable%: 100.0000


In [None]:
# Full fine-tuning run: train on the train split, evaluate on the validation
# split, with the shared arguments, collator, metrics and early stopping.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1628,0.070807,0.91266,0.919702,0.916168,0.979793
2,0.0472,0.070027,0.942201,0.927238,0.93466,0.982222
3,0.0261,0.067702,0.924367,0.92984,0.927095,0.982752
4,0.0174,0.074378,0.930693,0.936121,0.933399,0.983134
5,0.0126,0.079966,0.932217,0.943926,0.938035,0.984724
6,0.0076,0.099351,0.923413,0.928136,0.925768,0.981751


In [None]:
# Score the fully fine-tuned model on the test split, print each metric with
# four decimals, then save the model to disk.
result = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("\n".join(f"{key}: {value:.4f}" for key, value in result.items()))
model.save_pretrained("./ner-full-model")

In [None]:
# Frozen-encoder baseline: load the same checkpoint and switch off gradients
# for everything under `frozen_model.bert`. The id <-> tag mappings are stored
# in the config so that the model saved later reports tag names instead of
# generic LABEL_<n> placeholders.
frozen_model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=dict(enumerate(label_list)),
    label2id={tag: idx for idx, tag in enumerate(label_list)},
)
for param in frozen_model.bert.parameters():
    param.requires_grad = False
count_trainable_parameters(frozen_model)

In [None]:
# Frozen-encoder run: same data, arguments, collator, metrics and early
# stopping as the other runs; only the model differs.
frozen_trainer = Trainer(
    model=frozen_model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

frozen_trainer.train()

In [None]:
# Score the frozen-encoder model on the test split, print each metric with
# four decimals, then save the model to disk.
result = frozen_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("\n".join(f"{key}: {value:.4f}" for key, value in result.items()))
frozen_model.save_pretrained("./ner-frozen-model")

# Finetune model: LoRA

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# Load a fresh copy of the checkpoint to wrap with LoRA adapters.
base_model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Rank-8 LoRA on the modules named query / key / value, with bias="none".
lora_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    target_modules=["query", "key", "value"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

lora_model = get_peft_model(base_model, lora_config)
lora_model.print_trainable_parameters()

In [None]:
# LoRA run: same data, arguments, collator, metrics and early stopping as the
# other runs; only the model differs.
lora_trainer = Trainer(
    model=lora_model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

lora_trainer.train()

In [None]:
# Score the LoRA model on the test split, print each metric with four
# decimals, then save it to disk.
result = lora_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("\n".join(f"{key}: {value:.4f}" for key, value in result.items()))
lora_model.save_pretrained("./ner-lora-model")

# Finetune model: Prefix tuning

In [None]:
from json import encoder
from peft import PrefixTuningConfig

# Load a fresh copy of the checkpoint to wrap with prefix tuning.
base_model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Prefix tuning with 20 virtual tokens.
prefix_config = PrefixTuningConfig(
    task_type=TaskType.TOKEN_CLS,
    num_virtual_tokens=20,
    encoder_hidden_size=768,
)

prefix_model = get_peft_model(base_model, prefix_config)

# Explicitly mark every parameter whose name contains 'classifier' as
# trainable, so the classification head is learned alongside the prefix.
classifier_params = [
    param
    for name, param in prefix_model.named_parameters()
    if 'classifier' in name
]
for param in classifier_params:
    param.requires_grad = True

prefix_model.print_trainable_parameters()

In [None]:
# Prefix-tuning run: same data, arguments, collator, metrics and early
# stopping as the other runs; only the model differs.
prefix_trainer = Trainer(
    model=prefix_model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

prefix_trainer.train()

In [None]:
# Score the prefix-tuned model on the test split, print each metric with four
# decimals, then save it to disk.
result = prefix_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print("\n".join(f"{key}: {value:.4f}" for key, value in result.items()))
prefix_model.save_pretrained("./ner-prefix-model")