In [2]:
import warnings

warnings.filterwarnings("ignore")

# Load dataset
The **CoNLL-2003** dataset is a widely used benchmark for **Named Entity Recognition (NER)** tasks. It contains English newswire text annotated with four types of named entities:

- `PER`: Person
- `LOC`: Location
- `ORG`: Organization
- `MISC`: Miscellaneous

### Key Information

- **Dataset Name**: `conll2003`
- **Task**: Named Entity Recognition (NER)
- **Language**: English
- **Annotations**: IOB2 format (Inside, Outside, Beginning)
- **Splits**: `train`, `validation`, `test`

In [4]:
from datasets import load_dataset

dataset = load_dataset("conll2003", trust_remote_code=True)

README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
dataset = dataset.map(lambda x: {'tokens': x['tokens'], 'ner_tags': x['ner_tags']})

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
label_list = dataset['train'].features['ner_tags'].feature.names
print(label_list)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [8]:
model_name = "bert-base-cased"
num_labels = len(label_list)

# Data preprocessing

For NER tag using HuggingFace transformers, it's important to note that the tokenizer split a word into subwords.
So despite the dataset already splits the sentences into tokens, the BERT tokenizer will further split the tokens into the subword tokens.

With this in mind, the preprocessing function not only has to tokenize the already split sentences, but also adding labels to the subwords. We choose a label adding strategy that assign the label of the first subword to the following subwords.

In [9]:
from transformers import BertTokenizerFast, BertForTokenClassification

tokenizer = BertTokenizerFast.from_pretrained(model_name)

2025-06-01 05:51:27.285430: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748757087.498800      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748757087.547426      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
def tokenize_and_align_labels(example):
    tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        return_attention_mask=True
    )
    word_ids = tokenized.word_ids()
    labels = []
    prev_word_id = None

    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        elif word_id != prev_word_id:
            labels.append(example["ner_tags"][word_id])
        else:
            labels.append(example["ner_tags"][word_id])
        prev_word_id = word_id

    # labels = [-100] * NUM_PROMPT_TOKENS + labels

    tokenized["labels"] = labels
    return tokenized


In [11]:
# tokenize_and_align_labels(True)(dataset['train'][0])
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

# Set up Training arguments and Trainer

We use a unified training arguments, compute metrics, and datacollator throughout this task for all following models.

In [12]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_strategy="epoch",
    logging_dir="./logs",
    label_names=['labels'],
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
    report_to="tensorboard"
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

In [13]:
from evaluate import load
metric = load("seqeval")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    true_predictions = [
        [label_list[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "eval_precision": results["overall_precision"],
        "eval_recall": results["overall_recall"],
        "eval_f1": results["overall_f1"],
        "eval_accuracy": results.get("overall_accuracy", 0),
    }


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [14]:
from transformers import EarlyStoppingCallback

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Full fine-tuning

In [15]:
from peft import get_peft_model, TaskType
from transformers import BertForTokenClassification

In [16]:
model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
import numpy as np

def count_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    trainable_percentage = 100 * trainable_params / all_params

    print(f"trainable params: {trainable_params:,} || all params: {all_params:,} || trainable%: {trainable_percentage:.4f}")

count_trainable_parameters(model)

trainable params: 107,726,601 || all params: 107,726,601 || trainable%: 100.0000


In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1645,0.074557,0.904299,0.91728,0.910743,0.978351
2,0.0465,0.074877,0.939402,0.924906,0.932098,0.982325
3,0.025,0.06496,0.922427,0.930289,0.926341,0.982707
4,0.0167,0.069576,0.931881,0.940158,0.936001,0.984297
5,0.0103,0.08219,0.93622,0.938992,0.937604,0.984576
6,0.0061,0.078831,0.940253,0.940337,0.940295,0.98465
7,0.005,0.080769,0.937937,0.938274,0.938105,0.984679
8,0.0048,0.090136,0.927342,0.93325,0.930287,0.983487
9,0.0042,0.094806,0.935866,0.934775,0.93532,0.984194


TrainOutput(global_step=3951, training_loss=0.03145852205995305, metrics={'train_runtime': 1212.6468, 'train_samples_per_second': 347.364, 'train_steps_per_second': 10.861, 'total_flos': 3503482444206792.0, 'train_loss': 0.03145852205995305, 'epoch': 9.0})

In [19]:
result = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
for key, value in result.items():
    print(f"{key}: {value:.4f}")
model.save_pretrained("./ner-full-model");

eval_precision: 0.8964
eval_recall: 0.9013
eval_f1: 0.8988
eval_accuracy: 0.9706
eval_loss: 0.2075
eval_runtime: 10.6585
eval_samples_per_second: 323.9680
eval_steps_per_second: 10.1330
epoch: 9.0000


# Partial fine-tuning
Freeze base model, only train the classifier head

In [20]:
frozen_model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
for param in frozen_model.bert.parameters():
    param.requires_grad = False
count_trainable_parameters(frozen_model)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,921 || all params: 107,726,601 || trainable%: 0.0064


In [21]:
frozen_trainer = Trainer(
    model=frozen_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

frozen_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.2915,0.831477,0.203187,0.004576,0.00895,0.764143
2,0.7062,0.629037,0.494174,0.129374,0.205063,0.810635
3,0.5781,0.532651,0.600267,0.282523,0.384212,0.847104
4,0.5074,0.473596,0.62573,0.384443,0.47627,0.868723
5,0.4624,0.433676,0.643937,0.450745,0.530293,0.882454
6,0.4274,0.404367,0.654314,0.502153,0.568223,0.892977
7,0.4026,0.38239,0.660265,0.536695,0.592101,0.899747
8,0.3843,0.364502,0.665782,0.562982,0.610082,0.905089
9,0.3689,0.350487,0.673311,0.584873,0.625984,0.909681
10,0.3553,0.339322,0.679381,0.594473,0.634097,0.9118


TrainOutput(global_step=13170, training_loss=0.3865101355898082, metrics={'train_runtime': 2201.1062, 'train_samples_per_second': 191.372, 'train_steps_per_second': 5.983, 'total_flos': 1.1679116426563014e+16, 'train_loss': 0.3865101355898082, 'epoch': 30.0})

In [22]:
result = frozen_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
for key, value in result.items():
    print(f"{key}: {value:.4f}")
frozen_model.save_pretrained("./ner-frozen-model");

eval_precision: 0.6662
eval_recall: 0.6470
eval_f1: 0.6565
eval_accuracy: 0.9170
eval_loss: 0.3116
eval_runtime: 9.8776
eval_samples_per_second: 349.5800
eval_steps_per_second: 10.9340
epoch: 30.0000


# PEFT-LoRA

In [23]:
from peft import get_peft_model, LoraConfig, TaskType

base_model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

lora_config =  LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "key", "value"],
)

lora_model = get_peft_model(base_model, lora_config)
for param in lora_model.classifier.parameters():
    param.requires_grad = True
    
lora_model.print_trainable_parameters()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 456,210 || all params: 108,175,890 || trainable%: 0.4217


In [24]:
lora_trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

lora_trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.6873,0.287561,0.66237,0.673067,0.667675,0.925031
2,0.2433,0.190214,0.783863,0.767899,0.775799,0.948034
3,0.1789,0.145841,0.836903,0.820384,0.828561,0.959219
4,0.1502,0.127441,0.85713,0.842365,0.849683,0.96437
5,0.1313,0.113152,0.871804,0.865781,0.868782,0.968594
6,0.1183,0.103944,0.877752,0.879957,0.878853,0.970625
7,0.1099,0.097791,0.887642,0.887404,0.887523,0.972759
8,0.1043,0.094817,0.885084,0.885878,0.885481,0.972391
9,0.0968,0.089279,0.884164,0.899157,0.891597,0.973745
10,0.0914,0.08754,0.910238,0.899785,0.904981,0.975834


TrainOutput(global_step=10536, training_loss=0.12081455080094623, metrics={'train_runtime': 2609.154, 'train_samples_per_second': 161.443, 'train_steps_per_second': 5.048, 'total_flos': 9394031736101880.0, 'train_loss': 0.12081455080094623, 'epoch': 24.0})

In [25]:
result = lora_trainer.evaluate(eval_dataset=tokenized_dataset["test"])
for key, value in result.items():
    print(f"{key}: {value:.4f}")
lora_model.save_pretrained("./ner-lora-model");

eval_precision: 0.8783
eval_recall: 0.8794
eval_f1: 0.8788
eval_accuracy: 0.9660
eval_loss: 0.1634
eval_runtime: 11.4589
eval_samples_per_second: 301.3380
eval_steps_per_second: 9.4250
epoch: 24.0000


## Fine-tuning Strategies Comparison

| Strategy              | Trainable Params | Epochs | Train Runtime (s) | Eval Accuracy | Eval Precision | Eval Recall | Eval F1  | Eval Loss |
|-----------------------|------------------|--------|--------------------|---------------|----------------|-------------|----------|-----------|
| Full Fine-tuning      | 107,726,601      | 9      | 1212.65            | **0.9706**     | **0.8964**      | **0.9013**   | **0.8988** | 0.2075    |
| Classifier Only       | 6,921            | 30     | 2201.11            | 0.9170        | 0.6662         | 0.6470      | 0.6565   | 0.3116    |
| LoRA                  | 456,210          | 24     | 2609.15            | 0.9660        | 0.8783         | 0.8794      | 0.8788   | **0.1634** |

## Conclusion

- **Full fine-tuning** yields the best overall performance across all metrics (accuracy, precision, recall, F1), but requires updating all model weights (100% trainable parameters).
- **LoRA** offers a strong trade-off: with only **0.42%** of weights being trained, it achieves performance **very close to full fine-tuning**, while requiring less memory and being modular.
- **Classifier-only tuning** is lightweight (just **6.9k** parameters) and fast to train, but suffers in all evaluation metrics, especially F1 and precision.
