# Train PhoBERT NER tr√™n GPU RTX 4050

Training PhoBERT cho Named Entity Recognition v·ªõi GPU acceleration.

In [5]:
import torch
import json
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

print("="*60)
print("üîç GPU Status")
print("="*60)
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    device = "cuda"
else:
    print("‚ö†Ô∏è No GPU detected, using CPU")
    device = "cpu"
print("="*60)

üîç GPU Status
PyTorch: 2.6.0+cu124
CUDA available: True
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
VRAM: 6.00 GB


## 1. Load d·ªØ li·ªáu

In [6]:
# Load JSON data
print("üìÇ Loading data...")

with open("../../data/processed/train_phobert.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

with open("../../data/processed/val_phobert.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

print(f"‚úì Train: {len(train_data)} sentences")
print(f"‚úì Val: {len(val_data)} sentences")

# Sample
print(f"\nüìù Sample:")
print(f"Tokens: {train_data[0]['tokens'][:5]}...")
print(f"Tags: {train_data[0]['ner_tags'][:5]}...")

üìÇ Loading data...
‚úì Train: 800 sentences
‚úì Val: 201 sentences

üìù Sample:
Tokens: ['C√≥', 'ph·∫£i', 'S·ªët', 'ƒë√¥i', 'khi']...
Tags: ['O', 'O', 'B-SYMPTOM', 'I-SYMPTOM', 'I-SYMPTOM']...


## 2. T·∫°o label mapping

In [7]:
# Get all unique labels
all_labels = set()
for item in train_data + val_data:
    all_labels.update(item['ner_tags'])

label_list = sorted(list(all_labels))
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(f"üè∑Ô∏è  Labels ({len(label_list)}): {label_list}")
print(f"\nüìä Label mapping:")
for label, idx in label2id.items():
    print(f"  {idx}: {label}")

üè∑Ô∏è  Labels (8): ['B-DISEASE', 'B-SYMPTOM', 'I-DISEASE', 'I-I-SYMPTOM', 'I-SYMPTEM', 'I-SYMPTOM', 'O', 'T√¥i']

üìä Label mapping:
  0: B-DISEASE
  1: B-SYMPTOM
  2: I-DISEASE
  3: I-I-SYMPTOM
  4: I-SYMPTEM
  5: I-SYMPTOM
  6: O
  7: T√¥i


## 3. Convert labels sang IDs

In [8]:
# Convert labels to IDs
def convert_labels_to_ids(data, label2id):
    converted = []
    for item in data:
        converted.append({
            "tokens": item["tokens"],
            "ner_tags": [label2id[label] for label in item["ner_tags"]]
        })
    return converted

train_data = convert_labels_to_ids(train_data, label2id)
val_data = convert_labels_to_ids(val_data, label2id)

print("‚úì Converted labels to IDs")
print(f"\nSample after conversion:")
print(f"Tokens: {train_data[0]['tokens'][:5]}")
print(f"Tag IDs: {train_data[0]['ner_tags'][:5]}")

‚úì Converted labels to IDs

Sample after conversion:
Tokens: ['C√≥', 'ph·∫£i', 'S·ªët', 'ƒë√¥i', 'khi']
Tag IDs: [6, 6, 1, 5, 5]


## 4. T·∫°o Hugging Face Dataset

In [9]:
# Convert to HF Dataset
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print(f"‚úì Created HF Datasets")
print(f"\nüìä Dataset info:")
print(train_dataset)

‚úì Created HF Datasets

üìä Dataset info:
Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 800
})


## 5. Load PhoBERT

In [10]:
model_checkpoint = "vinai/phobert-base"

print(f"üì• Loading {model_checkpoint}...")

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Move to GPU
model = model.to(device)

print(f"\n‚úÖ Model loaded on {device}")
print(f"Parameters: {model.num_parameters():,}")

üì• Loading vinai/phobert-base...


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



‚úÖ Model loaded on cuda
Parameters: 134,413,832


## 6. Tokenize v√† align labels

In [11]:
def tokenize_and_align_labels(examples):
    """
    Tokenize text v√† align labels v·ªõi subword tokens
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=256,
        padding=False  # Kh√¥ng padding ·ªü ƒë√¢y, ƒë·ªÉ collator l√†m
    )
    
    labels = []
    
    for i, label in enumerate(examples["ner_tags"]):
        # PhoBERT tokenizer kh√¥ng c√≥ word_ids() ‚Üí ph·∫£i t·ª± align
        word_ids = []
        current_word_idx = 0
        
        # L·∫•y input_ids c·ªßa c√¢u n√†y
        input_ids = tokenized_inputs["input_ids"][i]
        tokens_from_ids = tokenizer.convert_ids_to_tokens(input_ids)
        
        for idx, token in enumerate(tokens_from_ids):
            # Special tokens
            if token in ["<s>", "</s>", "<pad>"]:
                word_ids.append(None)
            # Subword token (b·∫Øt ƒë·∫ßu v·ªõi @@)
            elif token.startswith("@@"):
                word_ids.append(current_word_idx - 1)  # Thu·ªôc t·ª´ tr∆∞·ªõc ƒë√≥
            # Normal token
            else:
                word_ids.append(current_word_idx)
                current_word_idx += 1
        
        # Align labels
        label_ids = []
        previous_word_idx = None
        
        for word_idx in word_ids:
            # Special token ‚Üí -100
            if word_idx is None:
                label_ids.append(-100)
            # T·ª´ m·ªõi ‚Üí g√°n label
            elif word_idx != previous_word_idx:
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            # Subword c·ªßa t·ª´ c≈© ‚Üí -100
            else:
                label_ids.append(-100)
            
            previous_word_idx = word_idx
        
        labels.append(label_ids)
    
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

print("üîÑ Tokenizing...")
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val = val_dataset.map(tokenize_and_align_labels, batched=True)

print("‚úì Tokenization complete")

üîÑ Tokenizing...


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 800/800 [00:00<00:00, 5986.31 examples/s]

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 201/201 [00:00<00:00, 6564.88 examples/s]

‚úì Tokenization complete





## 7. Data Collator

In [12]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True
)

print("‚úì Data collator ready")

‚úì Data collator ready


## 8. Metrics

In [13]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    
    true_labels = []
    pred_labels = []
    
    for prediction, label in zip(predictions, labels):
        true_label = []
        pred_label = []
        
        for pred, lab in zip(prediction, label):
            if lab != -100:
                true_label.append(id2label[lab])
                pred_label.append(id2label[pred])
        
        true_labels.append(true_label)
        pred_labels.append(pred_label)
    
    precision = precision_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels)
    f1 = f1_score(true_labels, pred_labels)
    
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

print("‚úì Metrics function ready")

‚úì Metrics function ready


## 9. Training Arguments - T·ªëi ∆∞u cho RTX 4050 (6GB)

In [14]:
import os

training_args = TrainingArguments(
    output_dir="../../models/phobert_ner_checkpoints",
    
    # ===== GPU OPTIMIZATION =====
    per_device_train_batch_size=16,      # RTX 4050 6GB c√≥ th·ªÉ handle
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,        # Effective batch = 16 * 2 = 32
    fp16=True,                            # Mixed Precision (ti·∫øt ki·ªám VRAM)
    dataloader_num_workers=4,             # TƒÉng t·ªëc data loading
    dataloader_pin_memory=True,
    
    # ===== TRAINING PARAMS =====
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    max_grad_norm=1.0,
    
    # ===== EVALUATION ===== (ƒê√É S·ª¨A)
    eval_strategy="epoch",                # ƒê·ªïi t·ª´ evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    
    # ===== LOGGING =====
    logging_dir="../../models/logs",
    logging_steps=20,
    logging_first_step=True,
    
    # ===== SAVING =====
    save_total_limit=2,  # Ch·ªâ gi·ªØ 2 checkpoint t·ªët nh·∫•t
    
    # ===== OTHER =====
    seed=42,
    report_to="none",
)

print("‚úÖ Training Arguments:")
print(f"   Device: {training_args.device}")
print(f"   FP16: {training_args.fp16}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   Effective batch: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Epochs: {training_args.num_train_epochs}")

‚úÖ Training Arguments:
   Device: cuda:0
   FP16: True
   Batch size: 16
   Gradient accumulation: 2
   Effective batch: 32
   Epochs: 5


## 10. Trainer

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("‚úÖ Trainer ready")

  trainer = Trainer(


‚úÖ Trainer ready


## 11. START TRAINING üöÄ

In [16]:
print("\n" + "="*60)
print("üöÄ B·∫ÆT ƒê·∫¶U TRAINING")
print("="*60)

# Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("üßπ GPU cache cleared")

# Train
train_result = trainer.train()

print("\n" + "="*60)
print("‚úÖ TRAINING COMPLETE")
print("="*60)
print(f"Loss: {train_result.training_loss:.4f}")
print(f"Time: {train_result.metrics['train_runtime']:.2f}s ({train_result.metrics['train_runtime']/60:.1f} mins)")


üöÄ B·∫ÆT ƒê·∫¶U TRAINING
üßπ GPU cache cleared


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.6382,0.702997,0.657471,0.718593,0.686675
2,0.6395,0.255503,0.835381,0.854271,0.84472
3,0.2856,0.211509,0.851852,0.866834,0.859278
4,0.1917,0.202463,0.856079,0.866834,0.861423
5,0.2032,0.193566,0.862843,0.869347,0.866083





‚úÖ TRAINING COMPLETE
Loss: 0.5220
Time: 321.66s (5.4 mins)


## 12. Evaluate

In [17]:
print("\nüìä Evaluating...")
eval_results = trainer.evaluate()

print("\n‚úÖ Results:")
print(f"   Precision: {eval_results['eval_precision']:.4f}")
print(f"   Recall: {eval_results['eval_recall']:.4f}")
print(f"   F1-score: {eval_results['eval_f1']:.4f}")


üìä Evaluating...



‚úÖ Results:
   Precision: 0.8628
   Recall: 0.8693
   F1-score: 0.8661




## 13. Save Model

In [18]:
print("\nüíæ Saving model...")

output_dir = "../../models/phobert_ner_model"
os.makedirs(output_dir, exist_ok=True)

# Save model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Save label mapping
label_info = {
    "label2id": label2id,
    "id2label": {str(k): v for k, v in id2label.items()}
}

with open(f"{output_dir}/label_mapping.json", "w", encoding="utf-8") as f:
    json.dump(label_info, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Saved to: {output_dir}")

# Clear GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("üßπ GPU cache cleared")


üíæ Saving model...
‚úÖ Saved to: ../../models/phobert_ner_model
üßπ GPU cache cleared
‚úÖ Saved to: ../../models/phobert_ner_model
üßπ GPU cache cleared


## 14. Test

In [20]:
print("\nüß™ TEST")
print("="*60)

test_sentence = "T√¥i b·ªã s·ªët cao, ƒëau ƒë·∫ßu v√† ho nhi·ªÅu"

# Tokenize
inputs = tokenizer(test_sentence, return_tensors="pt").to(device)

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Decode
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
predicted_labels = [id2label[p.item()] for p in predictions[0]]

print(f"\nC√¢u test: {test_sentence}")

# Debug: Xem c√°ch PhoBERT tokenize
print("\n" + "="*60)
print("DEBUG: C√°ch PhoBERT tokenize c√¢u")
print("="*60)
print(f"Input text: {test_sentence}")
print(f"Tokens t·ª´ tokenizer.tokenize(): {tokenizer.tokenize(test_sentence)}")
print(f"\nTokens t·ª´ convert_ids_to_tokens(): {tokens}")

print("\n" + "="*60)
print("K·∫æT QU·∫¢ (Tokens th√¥ t·ª´ PhoBERT):")
print("="*60)
for token, label in zip(tokens, predicted_labels):
    if token not in ['<s>', '</s>', '<pad>']:
        print(f"  {token:20s} -> {label}")

# ===== DECODE ƒê√öNG: Gh√©p subword l·∫°i =====
def decode_tokens_and_labels(tokens, labels):
    """Gh√©p subword tokens th√†nh t·ª´ g·ªëc (PhoBERT style)"""
    words = []
    word_labels = []
    current_word = ""
    current_label = None
    
    for token, label in zip(tokens, labels):
        # B·ªè qua special tokens
        if token in ['<s>', '</s>', '<pad>']:
            continue
        
        # QUAN TR·ªåNG: PhoBERT ƒë√°nh d·∫•u @@ ·ªû CU·ªêI token (kh√¥ng ph·∫£i ƒë·∫ßu)
        # V√≠ d·ª•: "cao@@" nghƒ©a l√† "cao" + subword ti·∫øp theo
        if token.endswith('@@'):
            # L∆∞u t·ª´ tr∆∞·ªõc (n·∫øu c√≥)
            if current_word:
                words.append(current_word)
                word_labels.append(current_label)
            # Token n√†y k·∫øt th√∫c b·∫±ng @@ ‚Üí t·ª´ ch∆∞a ho√†n ch·ªânh
            current_word = token[:-2]  # B·ªè @@
            current_label = label
        # Token b·∫Øt ƒë·∫ßu v·ªõi @@ (ti·∫øp t·ª•c t·ª´ tr∆∞·ªõc)
        elif token.startswith('@@'):
            current_word += token[2:]  # B·ªè @@
        # Token th∆∞·ªùng
        else:
            # N·∫øu c√≥ t·ª´ tr∆∞·ªõc ch∆∞a ho√†n ch·ªânh ‚Üí gh√©p ti·∫øp
            if current_word and current_word != "":
                current_word += token
            # N·∫øu kh√¥ng ‚Üí b·∫Øt ƒë·∫ßu t·ª´ m·ªõi
            else:
                # L∆∞u t·ª´ tr∆∞·ªõc (n·∫øu c√≥)
                if current_word:
                    words.append(current_word)
                    word_labels.append(current_label)
                current_word = token
                current_label = label
    
    # L∆∞u t·ª´ cu·ªëi
    if current_word:
        words.append(current_word)
        word_labels.append(current_label)
    
    return words, word_labels

words, word_labels = decode_tokens_and_labels(tokens, predicted_labels)

print("\n" + "="*60)
print("K·∫æT QU·∫¢ (ƒê√£ gh√©p subword - ƒê√öNG):")
print("="*60)
for word, label in zip(words, word_labels):
    print(f"  {word:20s} -> {label}")

print("\n‚úÖ Ho√†n t·∫•t!")
print("\nüìò Gi·∫£i th√≠ch PhoBERT tokenization:")
print("- PhoBERT c√≥ th·ªÉ ƒë√°nh d·∫•u @@ ·ªü 2 v·ªã tr√≠:")
print("  + Cu·ªëi token: 'cao@@' ‚Üí t·ª´ ch∆∞a ho√†n ch·ªânh, c√≥ subword ti·∫øp theo")
print("  + ƒê·∫ßu token: '@@t' ‚Üí subword ti·∫øp theo c·ªßa t·ª´ tr∆∞·ªõc")
print("- V√≠ d·ª•: 'information' ‚Üí ['in', '@@for', '@@mation']")
print("- Ho·∫∑c: 'cao nh·∫•t' ‚Üí ['cao@@', 'nh·∫•t'] (trong m·ªôt s·ªë tr∆∞·ªùng h·ª£p)")


üß™ TEST

C√¢u test: T√¥i b·ªã s·ªët cao, ƒëau ƒë·∫ßu v√† ho nhi·ªÅu

DEBUG: C√°ch PhoBERT tokenize c√¢u
Input text: T√¥i b·ªã s·ªët cao, ƒëau ƒë·∫ßu v√† ho nhi·ªÅu
Tokens t·ª´ tokenizer.tokenize(): ['T√¥i', 'b·ªã', 's·ªët', 'cao@@', ',', 'ƒëau', 'ƒë·∫ßu', 'v√†', 'ho', 'nhi·ªÅu']

Tokens t·ª´ convert_ids_to_tokens(): ['<s>', 'T√¥i', 'b·ªã', 's·ªët', 'cao@@', ',', 'ƒëau', 'ƒë·∫ßu', 'v√†', 'ho', 'nhi·ªÅu', '</s>']

K·∫æT QU·∫¢ (Tokens th√¥ t·ª´ PhoBERT):
  T√¥i                  -> O
  b·ªã                   -> O
  s·ªët                  -> B-SYMPTOM
  cao@@                -> I-SYMPTOM
  ,                    -> O
  ƒëau                  -> B-SYMPTOM
  ƒë·∫ßu                  -> I-SYMPTOM
  v√†                   -> O
  ho                   -> B-SYMPTOM
  nhi·ªÅu                -> I-SYMPTOM

K·∫æT QU·∫¢ (ƒê√£ gh√©p subword - ƒê√öNG):
  T√¥ib·ªãs·ªët             -> O
  cao,ƒëauƒë·∫ßuv√†honhi·ªÅu  -> I-SYMPTOM

‚úÖ Ho√†n t·∫•t!

üìò Gi·∫£i th√≠ch PhoBERT tokenization:
- PhoBERT c√≥ th·ªÉ ƒ