# Named Entity Recognition – RoBERTa (EN/VN)
**Objective/Mục tiêu**: Token classification on CoNLL-2003 with BIO labels.
**Inputs**: the CoNLL-2003 dataset (loaded with `datasets`); libraries: `transformers`, `evaluate` (with `seqeval`).
**Outputs**: precision / recall / F1 (seqeval), model checkpoint, `metrics.json`, label alignment demo.



# !pip install -q transformers datasets evaluate seqeval accelerate torch


In [None]:

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Load the CoNLL-2003 dataset and take the tag names from the dataset's own
# feature schema, so label ids and label names cannot drift apart.
dataset = load_dataset("conll2003")
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

model_name = "roberta-base"
# The examples are already split into words and are tokenized with
# is_split_into_words=True (see tokenize_and_align). RoBERTa's fast tokenizer
# only accepts pre-tokenized input when it was created with
# add_prefix_space=True, so the flag has to be set here.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    add_prefix_space=True,
)

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: Label ids, indexed by word position.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for tokens that belong to no word.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` where the
        word id is ``None``, the word's label for the first token of a word,
        and for the following tokens of the same word the word's label with
        odd ids bumped by one (B-* -> I-*).

    Note:
        The B-* -> I-* conversion relies on the label ids being laid out so
        that every odd id is a B-* tag and the next id is its I-* tag.
    """
    aligned = []
    last_word = None
    for current_word in word_ids:
        if current_word is None:
            tag = -100
        else:
            tag = labels[current_word]
            # A repeated word id means this token continues the previous word,
            # so a "begin" tag must become the matching "inside" tag.
            continues_word = current_word == last_word
            if continues_word and tag % 2 == 1:
                tag += 1
        aligned.append(tag)
        last_word = current_word
    return aligned

def tokenize_and_align(batch):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: Mapping with a ``"tokens"`` entry (passed to the tokenizer as
            pre-split words) and a ``"ner_tags"`` entry (one label sequence
            per example).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, for
        each example, the result of ``align_labels_with_tokens``.
    """
    encoded = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
    # word_ids(row) maps each token of example `row` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(row))
        for row, word_tags in enumerate(batch["ner_tags"])
    ]
    return encoded

# Run tokenization + label alignment over the dataset, one batch at a time.
tokenized = dataset.map(function=tokenize_and_align, batched=True)
# Collator instance that is handed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Metric object used by compute_metrics.
seqeval = evaluate.load(path="seqeval")
def compute_metrics(p):
    """Compute overall precision, recall and F1 with seqeval.

    Args:
        p: Object exposing ``predictions`` (arg-maxed over axis 2 to get one
            label id per position) and ``label_ids`` (reference label ids,
            where ``-100`` marks positions to skip).

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f1"`` taken from
        seqeval's overall scores.
    """
    predicted_ids = np.argmax(p.predictions, axis=2)
    reference_ids = p.label_ids

    predicted_tags = []
    reference_tags = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only positions whose reference label is not the -100 marker.
        kept = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        predicted_tags.append([label_list[predicted_id] for predicted_id, _ in kept])
        reference_tags.append([label_list[reference_id] for _, reference_id in kept])

    scores = seqeval.compute(predictions=predicted_tags, references=reference_tags)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
    }

# Store the tag names in the model config so the saved checkpoint reports the
# dataset's tag names (e.g. "B-PER") instead of generic "LABEL_n" placeholders.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Evaluation and checkpointing both happen once per epoch; the best checkpoint
# (by the "f1" value returned from compute_metrics) is reloaded at the end.
args = TrainingArguments(
    output_dir="./outputs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Train on the first 5000 training examples and evaluate on the first 1000
# validation examples rather than on the full splits.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"].select(range(5000)),
    eval_dataset=tokenized["validation"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
metrics = trainer.evaluate()
# Bare expression: displays the metrics as the notebook cell's output.
metrics



# Save artifacts
# `json` is imported here because json.dump is used below and no earlier
# import in this file provides it; without it this cell raises NameError.
import json

# Model weights/config and tokenizer files go into the same directory.
trainer.save_model("./model_roberta_conll2003")
tokenizer.save_pretrained("./model_roberta_conll2003")
# Write the evaluation metrics next to the notebook as pretty-printed JSON.
with open("metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)
print("Saved model + metrics.")
