### Fine-tune PhoBERT for Text Classification

In [8]:
!pip install torch torchvision torchaudio --quiet
!pip install transformers --quiet
!pip install scikit-learn --quiet

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, RobertaTokenizer, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

### Load tokenizer + model (PhoBERT-base)

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "vinai/phobert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3, trust_remote_code=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Class đọc .jsonl 

In [10]:
import json

class JsonlDataset(Dataset):
    def __init__(self, path, tokenizer, max_len=256):
        self.samples = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                self.samples.append(json.loads(line))
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]

        # Ghép các message thành 1 đoạn text duy nhất
        text = " ".join([m["content"] for m in sample["messages"]])
        
        # Ánh xạ label từ text -> số
        label_map = {"no": 0, "extrinsic": 1, "intrinsic": 2}
        label = label_map[sample["label"]]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long)
        }

### Load dữ liệu train/val/test

In [11]:
train_dataset = JsonlDataset("../train.jsonl", tokenizer)
val_dataset   = JsonlDataset("../val.jsonl", tokenizer)
test_dataset  = JsonlDataset("../test.jsonl", tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=16, shuffle=False)

### Cấu hình train

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

num_training_steps = num_epochs * len(train_loader)
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

### Training loop

In [13]:
from tqdm import trange

save_dir = "./checkpoints"
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    print(f"\n===== Epoch {epoch+1}/{num_epochs} =====")
    
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, desc="Training", leave=True)
    for batch in loop:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
        
        # Tính loss trung bình
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Lưu checkpoint
        epoch_save_path = os.path.join(save_dir, f"epoch_{epoch+1}")
        os.makedirs(epoch_save_path, exist_ok=True)
        model.save_pretrained(epoch_save_path)
        tokenizer.save_pretrained(epoch_save_path)
        print(f"Saved checkpoint to {epoch_save_path}")

    
    avg_train_loss = total_loss / len(train_loader)


===== Epoch 1/3 =====


Training:   0%|          | 0/10 [00:13<?, ?it/s, loss=1.08]

Average training loss: 0.1077


Training:  10%|█         | 1/10 [00:15<02:15, 15.09s/it, loss=1.08]

Saved checkpoint to ./checkpoints\epoch_1


Training:  10%|█         | 1/10 [00:24<02:15, 15.09s/it, loss=1.12]

Average training loss: 0.2194


Training:  20%|██        | 2/10 [00:24<01:35, 11.88s/it, loss=1.12]

Saved checkpoint to ./checkpoints\epoch_1


Training:  20%|██        | 2/10 [00:33<01:35, 11.88s/it, loss=1.12]

Average training loss: 0.3318


Training:  30%|███       | 3/10 [00:34<01:15, 10.75s/it, loss=1.12]

Saved checkpoint to ./checkpoints\epoch_1


Training:  30%|███       | 3/10 [00:44<01:15, 10.75s/it, loss=1.1] 

Average training loss: 0.4422


Training:  40%|████      | 4/10 [00:45<01:05, 10.99s/it, loss=1.1]

Saved checkpoint to ./checkpoints\epoch_1


Training:  40%|████      | 4/10 [00:53<01:05, 10.99s/it, loss=1.13]

Average training loss: 0.5548


Training:  50%|█████     | 5/10 [00:54<00:51, 10.31s/it, loss=1.13]

Saved checkpoint to ./checkpoints\epoch_1


Training:  50%|█████     | 5/10 [01:02<00:51, 10.31s/it, loss=1.08]

Average training loss: 0.6627


Training:  60%|██████    | 6/10 [01:03<00:39,  9.82s/it, loss=1.08]

Saved checkpoint to ./checkpoints\epoch_1


Training:  60%|██████    | 6/10 [01:11<00:39,  9.82s/it, loss=1.15]

Average training loss: 0.7776


Training:  70%|███████   | 7/10 [01:12<00:28,  9.50s/it, loss=1.15]

Saved checkpoint to ./checkpoints\epoch_1


Training:  70%|███████   | 7/10 [01:20<00:28,  9.50s/it, loss=1.1] 

Average training loss: 0.8879


Training:  80%|████████  | 8/10 [01:21<00:18,  9.25s/it, loss=1.1]

Saved checkpoint to ./checkpoints\epoch_1


Training:  80%|████████  | 8/10 [01:29<00:18,  9.25s/it, loss=1.08]

Average training loss: 0.9964


Training:  90%|█████████ | 9/10 [01:29<00:09,  9.14s/it, loss=1.08]

Saved checkpoint to ./checkpoints\epoch_1


Training:  90%|█████████ | 9/10 [01:37<00:09,  9.14s/it, loss=1.13]

Average training loss: 1.1096


Training: 100%|██████████| 10/10 [01:37<00:00,  9.79s/it, loss=1.13]


Saved checkpoint to ./checkpoints\epoch_1

===== Epoch 2/3 =====


Training:   0%|          | 0/10 [00:08<?, ?it/s, loss=1.06]

Average training loss: 0.1063


Training:  10%|█         | 1/10 [00:09<01:23,  9.25s/it, loss=1.06]

Saved checkpoint to ./checkpoints\epoch_2


Training:  10%|█         | 1/10 [00:19<01:23,  9.25s/it, loss=1.02]

Average training loss: 0.2081


Training:  20%|██        | 2/10 [00:20<01:22, 10.26s/it, loss=1.02]

Saved checkpoint to ./checkpoints\epoch_2


Training:  20%|██        | 2/10 [00:31<01:22, 10.26s/it, loss=1.08]

Average training loss: 0.3158


Training:  30%|███       | 3/10 [00:32<01:18, 11.14s/it, loss=1.08]

Saved checkpoint to ./checkpoints\epoch_2


Training:  30%|███       | 3/10 [00:43<01:18, 11.14s/it, loss=1.08]

Average training loss: 0.4238


Training:  40%|████      | 4/10 [00:44<01:09, 11.52s/it, loss=1.08]

Saved checkpoint to ./checkpoints\epoch_2


Training:  40%|████      | 4/10 [00:55<01:09, 11.52s/it, loss=1.09]

Average training loss: 0.5328


Training:  50%|█████     | 5/10 [00:56<00:58, 11.68s/it, loss=1.09]

Saved checkpoint to ./checkpoints\epoch_2


Training:  50%|█████     | 5/10 [01:07<00:58, 11.68s/it, loss=1.03]

Average training loss: 0.6362


Training:  60%|██████    | 6/10 [01:08<00:47, 11.89s/it, loss=1.03]

Saved checkpoint to ./checkpoints\epoch_2


Training:  60%|██████    | 6/10 [01:18<00:47, 11.89s/it, loss=1.07]

Average training loss: 0.7436


Training:  70%|███████   | 7/10 [01:18<00:33, 11.25s/it, loss=1.07]

Saved checkpoint to ./checkpoints\epoch_2


Training:  70%|███████   | 7/10 [01:26<00:33, 11.25s/it, loss=1.06]

Average training loss: 0.8498


Training:  80%|████████  | 8/10 [01:27<00:20, 10.49s/it, loss=1.06]

Saved checkpoint to ./checkpoints\epoch_2


Training:  80%|████████  | 8/10 [01:36<00:20, 10.49s/it, loss=1.05]

Average training loss: 0.9552


Training:  90%|█████████ | 9/10 [01:37<00:10, 10.23s/it, loss=1.05]

Saved checkpoint to ./checkpoints\epoch_2


Training:  90%|█████████ | 9/10 [01:46<00:10, 10.23s/it, loss=1.03]

Average training loss: 1.0583


Training: 100%|██████████| 10/10 [01:47<00:00, 10.77s/it, loss=1.03]


Saved checkpoint to ./checkpoints\epoch_2

===== Epoch 3/3 =====


Training:   0%|          | 0/10 [00:11<?, ?it/s, loss=1.05]

Average training loss: 0.1048


Training:  10%|█         | 1/10 [00:11<01:45, 11.71s/it, loss=1.05]

Saved checkpoint to ./checkpoints\epoch_3


Training:  10%|█         | 1/10 [00:22<01:45, 11.71s/it, loss=1.06]

Average training loss: 0.2104


Training:  20%|██        | 2/10 [00:23<01:34, 11.85s/it, loss=1.06]

Saved checkpoint to ./checkpoints\epoch_3


Training:  20%|██        | 2/10 [00:34<01:34, 11.85s/it, loss=1.01]

Average training loss: 0.3118


Training:  30%|███       | 3/10 [00:35<01:23, 11.89s/it, loss=1.01]

Saved checkpoint to ./checkpoints\epoch_3


Training:  30%|███       | 3/10 [00:46<01:23, 11.89s/it, loss=1.1] 

Average training loss: 0.4217


Training:  40%|████      | 4/10 [00:47<01:11, 11.87s/it, loss=1.1]

Saved checkpoint to ./checkpoints\epoch_3


Training:  40%|████      | 4/10 [00:58<01:11, 11.87s/it, loss=1.05]

Average training loss: 0.5272


Training:  50%|█████     | 5/10 [00:59<00:59, 11.82s/it, loss=1.05]

Saved checkpoint to ./checkpoints\epoch_3


Training:  50%|█████     | 5/10 [01:10<00:59, 11.82s/it, loss=1.02]

Average training loss: 0.6291


Training:  60%|██████    | 6/10 [01:11<00:48, 12.08s/it, loss=1.02]

Saved checkpoint to ./checkpoints\epoch_3


Training:  60%|██████    | 6/10 [01:21<00:48, 12.08s/it, loss=1.08]

Average training loss: 0.7367


Training:  70%|███████   | 7/10 [01:22<00:34, 11.57s/it, loss=1.08]

Saved checkpoint to ./checkpoints\epoch_3


Training:  70%|███████   | 7/10 [01:30<00:34, 11.57s/it, loss=1]   

Average training loss: 0.8367


Training:  80%|████████  | 8/10 [01:31<00:21, 10.79s/it, loss=1]

Saved checkpoint to ./checkpoints\epoch_3


Training:  80%|████████  | 8/10 [01:39<00:21, 10.79s/it, loss=1.06]

Average training loss: 0.9431


Training:  90%|█████████ | 9/10 [01:40<00:10, 10.19s/it, loss=1.06]

Saved checkpoint to ./checkpoints\epoch_3


Training:  90%|█████████ | 9/10 [01:47<00:10, 10.19s/it, loss=1.07]

Average training loss: 1.0501


Training: 100%|██████████| 10/10 [01:48<00:00, 10.81s/it, loss=1.07]

Saved checkpoint to ./checkpoints\epoch_3





In [15]:
SAVE_DIR = "./model"

tokenizer.save_pretrained(SAVE_DIR)
model.save_pretrained(SAVE_DIR)