In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re

In [35]:
from datasets import Dataset, DatasetDict

In [36]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    HfArgumentParser,
    PretrainedConfig,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [25]:
import shutil
shutil.copytree("/kaggle/input/ner-dataset-split", "/kaggle/working/ner-dataset-split")
path = '/kaggle/working/ner-dataset-split'

In [37]:
ds_test = Dataset.load_from_disk(path + "/ner_dataset_splits/test")
ds_val = Dataset.load_from_disk(path + "/ner_dataset_splits/validation")
ds_train = Dataset.load_from_disk(path + "/ner_dataset_splits/train")

In [38]:
ds_train

Dataset({
    features: ['tokens', 'labels', 'original_text', 'source'],
    num_rows: 84996
})

In [39]:
ds = DatasetDict({
    "train" : ds_train,
    "validation" : ds_val,
    "test" : ds_test
})

In [40]:
ds

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'original_text', 'source'],
        num_rows: 84996
    })
    validation: Dataset({
        features: ['tokens', 'labels', 'original_text', 'source'],
        num_rows: 10625
    })
    test: Dataset({
        features: ['tokens', 'labels', 'original_text', 'source'],
        num_rows: 10625
    })
})

In [41]:
from collections import Counter
def get_all_labels(ds):
    counter = Counter()
    for split in ds:
        counter.update(label for example in ds[split] for label in example["labels"])
    return sorted(counter.keys())

In [42]:
def sort_label_list_BIO(label_list):
  BI_pairs = []
  seen_i = set()
  for label in label_list:
    if label.startswith("B-"):
      entity = label[2:]
      i_label = f"I-{entity}"
      BI_pairs.append((label, i_label))
      seen_i.add(i_label)

  rest = [label for label in label_list if not label.startswith("B-") and label not in seen_i]

  sorted_list = []
  for b, i in sorted(BI_pairs):
    sorted_list.append(b)
    if i:
      sorted_list.append(i)
  sorted_list.extend(sorted(rest))

  return sorted_list

In [43]:
list_labels = get_all_labels(ds)
expanded_labels = sort_label_list_BIO(list_labels)

In [44]:
id2label = {i: label for i, label in enumerate(expanded_labels)}
label2id = {label: i for i, label in enumerate(expanded_labels)}

In [45]:
label2id

{'B-COMPANY': 0,
 'I-COMPANY': 1,
 'B-EVENT': 2,
 'I-EVENT': 3,
 'B-FIN_IND': 4,
 'I-FIN_IND': 5,
 'B-LAW': 6,
 'I-LAW': 7,
 'B-LOC': 8,
 'I-LOC': 9,
 'B-MBS': 10,
 'I-MBS': 11,
 'B-NUM': 12,
 'I-NUM': 13,
 'B-PERSON': 14,
 'I-PERSON': 15,
 'B-PRICE_ACTION': 16,
 'I-PRICE_ACTION': 17,
 'B-RISK': 18,
 'I-RISK': 19,
 'B-STOCK': 20,
 'I-STOCK': 21,
 'B-STRATEGY': 22,
 'I-STRATEGY': 23,
 'B-TIME': 24,
 'I-TIME': 25,
 'B-TITLE': 26,
 'I-TITLE': 27,
 'O': 28}

In [60]:
def add_labels_id(batch):
    batch["labels_id"] = [
        [label2id[label] for label in labels] for labels in batch["labels"]
    ]
    return batch
updated_ds = ds.map(add_labels_id, batched=True)

In [46]:
model_name = 'vinai/phobert-base-v2'

In [47]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [48]:
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=len(expanded_labels),
    id2label=id2label,
    label2id=label2id
)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    config=config
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
!pip install seqeval



In [50]:
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score

In [62]:
def shift_label(label_id, id2label):
  label_name = id2label[label_id]
  if label_name.startswith("B-"):
    return label_id + 1
  return label_id

def tokenize_and_align_labels_phobert(example):
  labels = example["labels_id"]
  tokens = example["tokens"]

  tokenized = tokenizer(
        example["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=256,        # cờ truncation
        return_overflowing_tokens=False
    )

  aligned_labels = []
  word_length = 0
  word_start = 0
  label = -100
  new_word = True

  for token in tokenized.input_ids:
    sub_word = tokenizer.convert_ids_to_tokens(token)

    if sub_word in ["<s>", "</s>"]:
      aligned_labels.append(-100)
      continue

    if word_start >= len(labels):
      continue

    # Đếm độ dài subword không tính dấu @@
    subword_clean = sub_word.replace("@@", "")
    length = len(subword_clean)

    # Một từ dài
    if sub_word.endswith("@@"):
      if not new_word: # token giữa
        aligned_labels.append(shift_label(labels[word_start], id2label))
      else: # token đầu
        aligned_labels.append(labels[word_start])
      word_length += length

    else:
      if word_start < len(labels):
        if not new_word: # token cuối
          aligned_labels.append(shift_label(labels[word_start], id2label))
        else: # từ đứng riêng
          aligned_labels.append(labels[word_start])

        word_length = word_length + length + 1
        word_start += word_length
        if word_start > len(labels)-1:
          word_start -= 1
        word_length = 0

    new_word = not sub_word.endswith("@@")

  tokenized["labels"] = aligned_labels
  return tokenized

# Áp dụng cho cả train và validation, theo batch để nhanh
tokenized_ds = updated_ds.map(
    tokenize_and_align_labels_phobert,
    batched=False,
    remove_columns=["tokens","original_text","source"]
)

In [63]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'labels_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 84996
    })
    validation: Dataset({
        features: ['labels', 'labels_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10625
    })
    test: Dataset({
        features: ['labels', 'labels_id', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10625
    })
})

In [51]:
from transformers import DataCollatorForTokenClassification
# tokenizer = AutoTokenizer.from_pretrained("vinai/")
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,          # tokenizer của phoBERT
    padding="longest",            # đệm đến câu dài nhất trong batch
    label_pad_token_id=-100       # token id mà Trainer sẽ IGNORE khi tính loss
)

In [55]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.

In [56]:
import evaluate
metric = evaluate.load("seqeval")
def compute_metrics_seqeval(p):
    predictions, labels = p
    # Get the index that has highest proportion from logits to predict
    predictions = np.argmax(predictions, axis=2)

    # convert ID to label name, eliminate token padding -100
    true_predictions = [
        [expanded_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [expanded_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script: 0.00B [00:00, ?B/s]

In [57]:
from transformers import Trainer
import torch
import torch.nn as nn

In [58]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [74]:
import torch
from collections import Counter
import numpy as np

all_label_ids = [
    label_id
    for example in ds["train"]
    for label_id in example["labels"]
    if label_id != -100
]

# 3. Đếm số lần xuất hiện
counts = Counter(all_label_ids)
num_labels = len(expanded_labels)

# 4. Tính class weights theo công thức logarit (ổn định và "hiền hòa" hơn)
class_weights = []
# Lặp qua các ID từ 0 đến num_labels-1
for label in expanded_labels:
    count = counts[label]
    print(count)
    weight = 1.0 / np.log(count + 1.01) if count > 0 else 0.0
    class_weights.append(weight)

# 5. Chuẩn hóa và chuyển sang tensor
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
class_weights_tensor = class_weights_tensor / class_weights_tensor.sum() * num_labels
class_weights_tensor = class_weights_tensor.to(device)

print("Class weights afer redefine:", dict(zip(expanded_labels, class_weights_tensor.cpu().numpy())))


17895
35100
1731
3182
4529
2131
8
0
5838
6946
3
0
46470
50050
2000
2841
3079
3407
302
156
53335
50920
1045
799
32396
73637
6
14
14151721
Class weights afer redefine: {'B-COMPANY': 0.6198445, 'I-COMPANY': 0.57994765, 'B-EVENT': 0.81395906, 'I-EVENT': 0.7525462, 'B-FIN_IND': 0.72099996, 'I-FIN_IND': 0.7918937, 'B-LAW': 2.7610548, 'I-LAW': 0.0, 'B-LOC': 0.69989645, 'I-LOC': 0.6861494, 'B-MBS': 4.3705077, 'I-MBS': 0.0, 'B-NUM': 0.5648049, 'I-NUM': 0.5609312, 'B-PERSON': 0.79849994, 'I-PERSON': 0.76326954, 'B-PRICE_ACTION': 0.75562793, 'I-PRICE_ACTION': 0.74622697, 'B-RISK': 1.0622982, 'I-RISK': 1.2004255, 'B-STOCK': 0.55765516, 'I-STOCK': 0.5600393, 'B-STRATEGY': 0.8729975, 'I-STRATEGY': 0.90801257, 'B-TIME': 0.584424, 'I-TIME': 0.5416051, 'B-TITLE': 3.116934, 'I-TITLE': 2.2408116, 'O': 0.36863622}


In [72]:
counts

Counter({'O': 14151721,
         'B-STOCK': 53335,
         'I-STOCK': 50920,
         'B-NUM': 46470,
         'B-TIME': 32396,
         'B-COMPANY': 17895,
         'I-COMPANY': 35100,
         'I-NUM': 50050,
         'I-TIME': 73637,
         'B-LOC': 5838,
         'I-LOC': 6946,
         'B-PERSON': 2000,
         'B-FIN_IND': 4529,
         'B-PRICE_ACTION': 3079,
         'I-PRICE_ACTION': 3407,
         'B-EVENT': 1731,
         'I-EVENT': 3182,
         'I-PERSON': 2841,
         'I-FIN_IND': 2131,
         'B-RISK': 302,
         'I-RISK': 156,
         'B-STRATEGY': 1045,
         'I-STRATEGY': 799,
         'B-LAW': 8,
         'B-MBS': 3,
         'B-TITLE': 6,
         'I-TITLE': 14})

In [80]:
class TokenClassificationTrainer(Trainer) :
  def compute_loss(self, model, inputs, return_outputs=False, **kwargs) :
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fct = nn.CrossEntropyLoss(
        weight = class_weights_tensor.to(logits.device),
        ignore_index = -100)
    loss = loss_fct(
            logits.view(-1, model.module.config.num_labels),
            labels.view(-1)
        )

    return (loss, outputs) if return_outputs else loss

In [76]:
from transformers import TrainingArguments

In [81]:
training_args = TrainingArguments(
    output_dir="./checkpoints",       # Thư mục lưu model và logs
    num_train_epochs=3,               # Số lần duyệt hết dữ liệu huấn luyện
    per_device_train_batch_size=16,   # Batch size cho mỗi GPU/CPU
    per_device_eval_batch_size=16,    # Batch size khi đánh giá (16 là đủ rồi)
    learning_rate=2e-5,               # Tốc độ học (LR)
    weight_decay=0.01,                # L2 regularization
    eval_strategy="epoch",            # Đánh giá sau mỗi epoch (hoặc "steps")
    logging_strategy="steps",         # Ghi log mỗi vài bước
    logging_steps=50,                 # Bao nhiêu bước ghi một lần
    load_best_model_at_end=True,      #cờ để lần sau train tiếp
    push_to_hub = True,               # đẩy lên hugging face
    hub_model_id="AnTrinh/my-phobert-ner",
    hub_token="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",
    save_strategy="epoch",            # Lưu model sau mỗi epoch
    metric_for_best_model="f1",       # Chọn model tốt nhất theo F1-score
    greater_is_better=True,           # F1 càng lớn càng tốt
    seed=42,                          # Đặt seed để tái lập kết quả
    report_to= "none"
)

In [84]:
def compute_metrics_fn(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[expanded_labels[l] for l in label if l != -100] for label in labels]
    true_predictions = [[expanded_labels[pred] for (pred, lab) in zip(pred_row, label_row) if lab != -100]
                        for pred_row, label_row in zip(predictions, labels)]

    return {
        "precision": precision_score(true_labels, true_predictions, zero_division=0),
        "recall": recall_score(true_labels, true_predictions, zero_division=0),
        "f1": f1_score(true_labels, true_predictions, zero_division=0),
    }


In [85]:
trainer = TokenClassificationTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_fn
)

# Chạy fine-tuning
trainer.train()

  trainer = TokenClassificationTrainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.0457,0.041853,0.881307,0.885428,0.883363
2,0.0356,0.02554,0.928585,0.935185,0.931873
3,0.0242,0.021473,0.93806,0.949027,0.943512


TrainOutput(global_step=7971, training_loss=0.04650249222921855, metrics={'train_runtime': 3160.775, 'train_samples_per_second': 80.673, 'train_steps_per_second': 2.522, 'total_flos': 1.0511065398432984e+16, 'train_loss': 0.04650249222921855, 'epoch': 3.0})

In [87]:
trainer.push_to_hub(commit_message="Upload fine-tuned model")

Uploading...:   0%|          | 0.00/538M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AnTrinh/my-phobert-ner/commit/96c15ae947a83b0fa9423534aa6d18c0401fdbd2', commit_message='Upload fine-tuned model', commit_description='', oid='96c15ae947a83b0fa9423534aa6d18c0401fdbd2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AnTrinh/my-phobert-ner', endpoint='https://huggingface.co', repo_type='model', repo_id='AnTrinh/my-phobert-ner'), pr_revision=None, pr_num=None)