In [4]:
# 2b_train_ner_transformers.py
# Hu·∫•n luy·ªán NER v·ªõi PhoBERT (transformers)

from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorForTokenClassification
)
import torch
from datasets import Dataset
import numpy as np

In [None]:
# D·ªØ li·ªáu training
data = {
    "tokens": [
        ["T√¥i", "b·ªã", "s·ªët", "cao", "v√†", "ƒëau", "ƒë·∫ßu"],
        ["Em", "b·ªã", "ho", "c√≥", "ƒë·ªùm"],
        ["Con", "b·ªã", "ƒëau", "b·ª•ng", "v√†", "ti√™u", "ch·∫£y"],
        ["B·ªánh", "nh√¢n", "c√≥", "s·ªët", "v√†", "bu·ªìn", "n√¥n"],
    ],
    "ner_tags": [
        [0, 0, 1, 2, 0, 1, 2],  # O, O, B-SYM, I-SYM, O, B-SYM, I-SYM
        [0, 0, 1, 2, 2],
        [0, 0, 1, 2, 0, 1, 2],
        [0, 0, 0, 1, 0, 1, 2],
    ]
}

# Label mapping
label_list = ["O", "B-SYMPTOM", "I-SYMPTOM"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

In [10]:

# Load PhoBERT tokenizer
model_checkpoint = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# if not getattr(tokenizer, "is_fast", False):
#     raise ValueError("A fast tokenizer is required for label alignment (word_ids).")

# Tokenize v√† align labels (works with fast tokenizers and falls back to a
# manual alignment for slow tokenizers that don't implement `word_ids()`)
def tokenize_and_align_labels(examples):
    # If the tokenizer is fast, use the convenient word_ids() method
    if getattr(tokenizer, "is_fast", False):
        tokenized_inputs = tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True,
            padding=True
        )

        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            label_ids = []
            previous_word_idx = None

            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # Slow tokenizer fallback: build inputs and label alignment manually
    # This works by tokenizing each word separately and assigning the label
    # to the first token of the word, -100 to following subword tokens.
    input_ids_batch = []
    attention_mask_batch = []
    labels_batch = []

    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else (
        tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0
    )

    for words, word_labels in zip(examples["tokens"], examples["ner_tags"]):
        ids = []
        lbls = []

        # add cls token if tokenizer has it
        if tokenizer.cls_token_id is not None:
            ids.append(tokenizer.cls_token_id)
            lbls.append(-100)

        for w_idx, word in enumerate(words):
            # tokenize the single word (no special tokens)
            word_pieces = tokenizer.tokenize(word)
            if len(word_pieces) == 0:
                # fallback: treat whole word as one token
                token_ids = [tokenizer.unk_token_id if tokenizer.unk_token_id is not None else 0]
            else:
                token_ids = tokenizer.convert_tokens_to_ids(word_pieces)

            ids.extend(token_ids)
            # first piece gets the label, others get -100
            lbls.append(word_labels[w_idx])
            if len(token_ids) > 1:
                lbls.extend([-100] * (len(token_ids) - 1))

        # add sep token if tokenizer has it
        if tokenizer.sep_token_id is not None:
            ids.append(tokenizer.sep_token_id)
            lbls.append(-100)

        input_ids_batch.append(ids)
        labels_batch.append(lbls)
        attention_mask_batch.append([1] * len(ids))

    # pad sequences to the same length
    max_len = max(len(x) for x in input_ids_batch)
    input_ids_padded = [ids + [pad_id] * (max_len - len(ids)) for ids in input_ids_batch]
    attention_padded = [mask + [0] * (max_len - len(mask)) for mask in attention_mask_batch]
    labels_padded = [lbl + [-100] * (max_len - len(lbl)) for lbl in labels_batch]

    return {"input_ids": input_ids_padded, "attention_mask": attention_padded, "labels": labels_padded}

# T·∫°o dataset
dataset = Dataset.from_dict(data)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./ner_phobert_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_steps=10,
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train
print("üöÄ B·∫Øt ƒë·∫ßu training PhoBERT-NER...")
trainer.train()

# Save model
model.save_pretrained("./ner_phobert_model")
tokenizer.save_pretrained("./ner_phobert_model")
print("‚úì ƒê√£ l∆∞u model!")

# Test
def predict_ner(text):
    """D·ª± ƒëo√°n NER cho c√¢u m·ªõi"""
    inputs = tokenizer(text, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    
    results = []
    for token, pred in zip(tokens, predictions[0]):
        if token not in ["<s>", "</s>", "<pad>"]:
            results.append((token, id2label[pred.item()]))
    
    return results

# Test
test_text = "T√¥i b·ªã s·ªët cao v√† ƒëau ƒë·∫ßu"
print(f"\nüìù Test: {test_text}")
print(predict_ner(test_text))

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:00<00:00, 1090.92 examples/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'