In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import torch # Import torch at a higher level
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from collections import Counter
import unicodedata # Kept if you plan to use it later for more advanced text cleaning
import re # Kept for potential advanced text cleaning
from google.colab import drive
from huggingface_hub import login


# --- Configuration ---
# Checkpoint to fine-tune and the Google Drive locations used by this notebook.
MODEL_NAME = "uitnlp/CafeBERT"
DATASET_PATH = "/content/drive/MyDrive/ChongPha_Ver2/Dataset_ChongPha.csv"
OUTPUT_DIR = '/content/drive/MyDrive/ChongPha_Ver2/VisoBERT/OUTPUT'
LOGGING_DIR = '/content/drive/MyDrive/ChongPha_Ver2/VisoBERT/LOGGING'

# Every path above lives under /content/drive, so Drive is mounted first.
drive.mount('/content/drive')
# Optional Hugging Face Hub authentication (never commit a real token):
# login(token='YOUR_HF_TOKEN_HERE')

# Class names in a fixed order: a label's position in LABELS is its integer id.
# The same ordering is reused by the Dataset class and the classification report.
LABELS = ["PHAN_DONG", "KHONG_PHAN_DONG", "KHONG_LIEN_QUAN"]
NUM_LABELS = len(LABELS)
ID_TO_LABEL = dict(enumerate(LABELS))
LABEL_TO_ID = {name: idx for idx, name in ID_TO_LABEL.items()}

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Special-token strings are read from the tokenizer rather than hard-coded;
# the recorded run of this cell printed '<s>' and '</s>' for this checkpoint.
ACTUAL_CLS_TOKEN = tokenizer.cls_token
ACTUAL_SEP_TOKEN = tokenizer.sep_token
print(f"Using CLS token: '{ACTUAL_CLS_TOKEN}', SEP token: '{ACTUAL_SEP_TOKEN}'")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading tokenizer: uitnlp/CafeBERT


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Using CLS token: '<s>', SEP token: '</s>'


In [2]:
from sklearn.preprocessing import LabelEncoder
def load_and_prepare_data(file_path, cls_token, sep_token, combine_summary_comment=True):
    """Read a CSV file and build the model input text and label columns.

    Args:
        file_path: Path of the CSV file to read with ``pd.read_csv``.
        cls_token: String placed at the start of every text.
        sep_token: String placed after each text segment.
        combine_summary_comment: When True, the text is built from the
            ``summary`` and ``comment_clean`` columns; when False it is built
            from the ``comment`` column alone.

    Returns:
        A ``(texts, labels)`` tuple of pandas Series: the assembled text and
        the label column cast to ``str``.

    Raises:
        ValueError: If the label column contains a value that is not in the
            module-level ``LABELS`` list.
    """
    df = pd.read_csv(file_path)

    def _as_text(column_name):
        # Missing cells must become '' rather than the literal string "nan".
        # Casting with astype(str) first (as the previous code did) converts
        # NaN to "nan", after which fillna('') has nothing left to fill, so
        # the blanking is done from the ORIGINAL column's missing-value mask.
        column = df[column_name]
        return column.astype(str).mask(column.isna(), '')

    if combine_summary_comment:
        df['summary'] = _as_text('summary')
        df['comment_clean'] = _as_text('comment_clean')

        # Manually construct the full string with all special tokens
        df['text'] = f"{cls_token} " + df['summary'] + f" {sep_token} " + df['comment_clean'] + f" {sep_token}"
    else:
        # Single-segment fallback: only the raw 'comment' column is used.
        df['comment_clean'] = _as_text('comment')
        df['text'] = f"{cls_token} " + df['comment_clean'] + f" {sep_token}"

    # A missing label is deliberately NOT blanked: it becomes "nan" here and is
    # then rejected by the validation below.
    df['label'] = df['label'].astype(str)
    unknown_labels = set(df['label']) - set(LABELS)
    if unknown_labels:
        raise ValueError(f"Unknown labels found in {file_path}: {unknown_labels}. Expected labels are: {LABELS}")
    return df['text'], df['label']
print("Loading data...")

try:
    # load_and_prepare_data returns a (texts, labels) pair of pandas Series.
    data_texts, data_labels_str = load_and_prepare_data(
        DATASET_PATH, cls_token=ACTUAL_CLS_TOKEN, sep_token=ACTUAL_SEP_TOKEN
    )

    # --- First split: 80% train vs. 20% held out ---
    train_texts, temp_texts, train_labels_str, temp_labels_str = train_test_split(
        data_texts,                # Features
        data_labels_str,           # Labels (original strings)
        test_size=0.2,             # 20% goes to the temporary held-out pool
        random_state=2025,         # For reproducibility
        stratify=data_labels_str,  # Keep class proportions equal across splits
    )

    # --- Second split: halve the held-out pool ---
    # 50% of the 20% pool each, i.e. 10% of the original data for validation
    # and 10% for test. Stratified on the pool's own labels.
    val_texts, test_texts, val_labels_str, test_labels_str = train_test_split(
        temp_texts,
        temp_labels_str,
        test_size=0.5,
        random_state=2025,
        stratify=temp_labels_str,
    )

    print(f"Total examples: {len(data_texts)}")
    print(f"Train examples: {len(train_texts)}, Validation examples: {len(val_texts)}, Test examples: {len(test_texts)}")

    # --- Verification of Label Distribution ---
    print("\nOriginal dataset label distribution:")
    print(pd.Series(data_labels_str).value_counts(normalize=True).sort_index())

    print("\nTraining set label distribution:")
    print(pd.Series(train_labels_str).value_counts(normalize=True).sort_index())

    print("\nValidation set label distribution:")
    print(pd.Series(val_labels_str).value_counts(normalize=True).sort_index())

    print("\nTest set label distribution:")
    print(pd.Series(test_labels_str).value_counts(normalize=True).sort_index())
    print(f"Sample train text: {train_texts.iloc[0]}...") # Print a sample
    print(f"Sample train label: {train_labels_str.iloc[0]}")
    print(f"Unique labels in training data: {train_labels_str.unique()}")

except FileNotFoundError:
    print("Error: One or more data files not found. Please check file paths.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down, while a raised exception stops this cell (and "Run All") with a
    # traceback and leaves the session intact.
    raise
except ValueError as ve:
    # Raised by load_and_prepare_data for unknown labels, or by
    # train_test_split when a stratified split is not possible.
    print(ve)
    raise

Loading data...
Total examples: 18912
Train examples: 15129, Validation examples: 1891, Test examples: 1892

Original dataset label distribution:
label
KHONG_LIEN_QUAN    0.528976
KHONG_PHAN_DONG    0.356599
PHAN_DONG          0.114425
Name: proportion, dtype: float64

Training set label distribution:
label
KHONG_LIEN_QUAN    0.528984
KHONG_PHAN_DONG    0.356600
PHAN_DONG          0.114416
Name: proportion, dtype: float64

Validation set label distribution:
label
KHONG_LIEN_QUAN    0.528821
KHONG_PHAN_DONG    0.356425
PHAN_DONG          0.114754
Name: proportion, dtype: float64

Test set label distribution:
label
KHONG_LIEN_QUAN    0.529070
KHONG_PHAN_DONG    0.356765
PHAN_DONG          0.114165
Name: proportion, dtype: float64
Sample train text: <s> 1. Nội dung sơ lược: Bài viết thể hiện sự bất bình về việc chính phủ Việt Nam ăn mừng ngày 30/4, cho rằng đó là ngày 'nồi da xáo thịt' và lên án việc 'nhồi sọ' thế hệ trẻ về lịch sử, đặc biệt là về mối quan hệ với Trung Quốc.

2. Vấn đề: P

In [3]:
# --- 2. Tokenization ---
MAX_LENGTH = 384  # Max sequence length for tokenizer


def _encode_split(split_texts):
    """Tokenize one split (a pandas Series of strings) with truncation and padding enabled."""
    # NOTE(review): the texts already contain the CLS/SEP strings inserted by
    # load_and_prepare_data, and this call does not disable the tokenizer's own
    # special-token handling, so they may be added a second time - confirm.
    return tokenizer(split_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH)


print("Tokenizing texts...")
train_encodings = _encode_split(train_texts)
val_encodings = _encode_split(val_texts)
test_encodings = _encode_split(test_texts)

Tokenizing texts...


In [4]:
# --- 3. Custom Dataset ---
class CommentDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with string class labels.

    Args:
        encodings: Mapping from field name to a per-example sequence, accessed
            through ``.items()`` and integer indexing.
        string_labels: Label strings, one per example. Objects exposing
            ``tolist()`` (e.g. a pandas Series) are converted with it; any
            other iterable is materialised with ``list()``.

    Raises:
        ValueError: If a label is not a ``str``, is not a key of the
            module-level ``LABEL_TO_ID`` mapping, or if an encoding field does
            not hold exactly one entry per label.
    """

    def __init__(self, encodings, string_labels):
        self.encodings = encodings
        if hasattr(string_labels, 'tolist'): # Handles pandas Series
            self.string_labels = string_labels.tolist()
        else:
            # list() also accepts tuples/generators, so __len__ and indexing
            # below always operate on a real sequence.
            self.string_labels = list(string_labels)

        # Validate labels during initialization
        if not all(isinstance(label, str) for label in self.string_labels):
            raise ValueError("All labels must be strings.")
        unknown = set(self.string_labels) - set(LABEL_TO_ID.keys())
        if unknown:
            raise ValueError(f"Unknown labels found: {unknown}. Known: {list(LABEL_TO_ID.keys())}")

        # Fail fast on misaligned inputs: otherwise a mismatch only shows up as
        # an IndexError during training, or as silently ignored examples.
        expected = len(self.string_labels)
        for key, values in self.encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} entries but {expected} labels were given."
                )

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of long tensors, including a 'labels' entry."""
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        label_str = self.string_labels[idx]
        item['labels'] = torch.tensor(LABEL_TO_ID[label_str], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.string_labels)

def token_analyzing(all_texts):
    """Print token-length percentiles for ``all_texts`` and return the 98th one.

    Each text is encoded with the module-level ``tokenizer`` using
    ``add_special_tokens=True``, and the length of the resulting id list is
    what gets measured. The 98th percentile is returned truncated to ``int``.
    """
    lengths = [
        len(tokenizer.encode(sample, add_special_tokens=True))
        for sample in all_texts
    ]

    print("PERCENTILE ANALYSIS:")
    for pct in (50, 75, 90, 95, 98, 99, 99.5, 99.9):
        cutoff = np.percentile(lengths, pct)
        print(f"   {pct:4.1f}th percentile: {cutoff:6.0f} tokens")

    return int(np.percentile(lengths, 98))

print("Creating datasets...")

# Token-length statistics are computed over all three splits together.
all_texts = [*train_texts.tolist(), *val_texts.tolist(), *test_texts.tolist()]
recommended_98th = token_analyzing(all_texts)

print(f"\nRecommended MAX_LENGTH for 98% coverage: {recommended_98th}")
print(f"Current MAX_LENGTH: {MAX_LENGTH}")


def _dataset_or_none(split_texts, split_encodings, split_labels, empty_notice):
    """Wrap one split in a CommentDataset; for an empty split print the notice and return None."""
    if split_texts.empty:
        print(empty_notice)
        return None
    return CommentDataset(split_encodings, split_labels)


train_dataset = _dataset_or_none(
    train_texts, train_encodings, train_labels_str,
    "Train dataset is empty. Cannot create CommentDataset for training.",
)
val_dataset = _dataset_or_none(
    val_texts, val_encodings, val_labels_str,
    "Validation dataset is empty. Cannot create CommentDataset for validation.",
)
test_dataset = _dataset_or_none(
    test_texts, test_encodings, test_labels_str,
    "Test dataset is empty. Cannot create CommentDataset for testing.",
)


Creating datasets...


Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors


PERCENTILE ANALYSIS:
   50.0th percentile:    129 tokens
   75.0th percentile:    162 tokens
   90.0th percentile:    200 tokens
   95.0th percentile:    227 tokens
   98.0th percentile:    271 tokens
   99.0th percentile:    307 tokens
   99.5th percentile:    346 tokens
   99.9th percentile:    408 tokens

Recommended MAX_LENGTH for 98% coverage: 271
Current MAX_LENGTH: 384


In [5]:
# Add this cell after your dataset creation
from collections import Counter
from sklearn.utils import resample
import pandas as pd

def oversample_minority_classes(texts, labels, target_ratio=0.8, random_state=42):
    """
    Oversample under-represented classes by drawing extra rows with replacement.

    Every class listed in the module-level ``LABELS`` whose row count is below
    ``int(majority_count * target_ratio)`` is topped up to exactly that size;
    classes already at or above the target are left unchanged. The combined
    result is shuffled before being returned.

    Args:
        texts: Text data (pandas Series or any iterable of strings)
        labels: Labels, one per text, in the same order as ``texts``
        target_ratio: Target size of a minority class relative to the majority class
        random_state: Seed used both for the resampling draw and the final
            shuffle (default 42, the value that was previously hard-coded)

    Returns:
        ``(texts, labels)`` as two pandas Series with a fresh 0..n-1 index.
        Both are empty when there is nothing to return.
    """
    # Pair texts and labels by POSITION. Building the frame straight from two
    # Series would align them on their index instead, which silently produces
    # NaN rows if the indexes ever differ.
    df = pd.DataFrame({'text': list(texts), 'label': list(labels)})

    # Count original distribution
    label_counts = Counter(df['label'])
    print("Original distribution:")
    for label, count in label_counts.items():
        print(f"  {label}: {count}")

    # Only classes named in LABELS are kept; make the dropped ones visible.
    ignored = set(label_counts) - set(LABELS)
    if ignored:
        print(f"  Warning: rows with labels outside LABELS are dropped: {sorted(map(str, ignored))}")

    if df.empty:
        # max() below would fail on an empty distribution.
        return df['text'], df['label']

    # Find majority class size
    majority_size = max(label_counts.values())
    target_minority_size = int(majority_size * target_ratio)

    balanced_dfs = []

    for label in LABELS:
        label_df = df[df['label'] == label]
        current_size = len(label_df)

        if current_size == 0:
            # Nothing to draw from: resampling an empty frame raises.
            print(f"  {label}: 0 (no samples, skipped)")
            continue

        if current_size < target_minority_size:
            # Oversample this class up to the target size
            n_samples_needed = target_minority_size - current_size
            oversampled = resample(
                label_df,
                n_samples=n_samples_needed,
                random_state=random_state,
                replace=True
            )
            combined_df = pd.concat([label_df, oversampled], ignore_index=True)
            print(f"  {label}: {current_size} -> {len(combined_df)} (oversampled)")
        else:
            combined_df = label_df
            print(f"  {label}: {current_size} (no change)")

        balanced_dfs.append(combined_df)

    if not balanced_dfs:
        # No row carried a label from LABELS; pd.concat([]) would raise.
        empty = df.iloc[0:0]
        return empty['text'], empty['label']

    # Combine all classes
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)

    # Shuffle so the classes are not grouped together in the result
    balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print(f"\nFinal balanced distribution:")
    final_counts = Counter(balanced_df['label'])
    for label, count in final_counts.items():
        print(f"  {label}: {count}")

    return balanced_df['text'], balanced_df['label']

# Oversampling is applied to the training split only; validation and test
# keep their original class distribution.
# NOTE(review): the Trainer cell visible in this notebook is given
# `train_dataset`, not `balanced_train_dataset` - confirm which one is intended.
print("Applying oversampling to training data...")
balanced_train_texts, balanced_train_labels = oversample_minority_classes(
    train_texts, train_labels_str, target_ratio=0.7
)

# The oversampled texts are a new collection, so they need their own encodings.
print("Re-tokenizing balanced training data...")
_retokenize_options = dict(truncation=True, padding=True, max_length=MAX_LENGTH)
balanced_train_encodings = tokenizer(balanced_train_texts.tolist(), **_retokenize_options)

# Dataset over the balanced training data
balanced_train_dataset = CommentDataset(balanced_train_encodings, balanced_train_labels)

for _split_name, _split in (("Original", train_dataset), ("Balanced", balanced_train_dataset)):
    print(f"{_split_name} training size: {len(_split)}")

Applying oversampling to training data...
Original distribution:
  KHONG_PHAN_DONG: 5395
  KHONG_LIEN_QUAN: 8003
  PHAN_DONG: 1731
  PHAN_DONG: 1731 -> 5602 (oversampled)
  KHONG_PHAN_DONG: 5395 -> 5602 (oversampled)
  KHONG_LIEN_QUAN: 8003 (no change)

Final balanced distribution:
  PHAN_DONG: 5602
  KHONG_PHAN_DONG: 5602
  KHONG_LIEN_QUAN: 8003
Re-tokenizing balanced training data...
Original training size: 15129
Balanced training size: 19207


In [6]:
# --- Enhanced Metrics for Imbalanced Data ---
def compute_balanced_metrics(pred):
    """Compute overall and per-class classification metrics for the Trainer.

    Args:
        pred: Prediction object exposing ``label_ids`` (true class ids) and
            ``predictions`` (scores whose last axis is arg-maxed into a
            predicted class id).

    Returns:
        Dict with accuracy, balanced accuracy, weighted/macro precision,
        recall and F1, plus one ``f1_<label>`` and one ``recall_<label>``
        entry per class in the module-level ``LABELS`` (lower-cased name).
    """
    from sklearn.metrics import balanced_accuracy_score

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Multiple F1 variants for comparison
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)

    # Overall accuracy
    acc = accuracy_score(labels, preds)

    # Per-class metrics (crucial for imbalanced data).
    # `labels=` pins the output to one entry per class id, in LABELS order.
    # Without it scikit-learn only reports classes that occur in the batch, so
    # a missing class would shift the positions (or raise IndexError) when the
    # arrays are indexed by class id below.
    class_ids = list(range(len(LABELS)))
    per_class_precision, per_class_recall, per_class_f1, _ = precision_recall_fscore_support(
        labels, preds, labels=class_ids, average=None, zero_division=0
    )

    # Balanced accuracy (better than regular accuracy for imbalanced data)
    balanced_acc = balanced_accuracy_score(labels, preds)

    metrics = {
        'accuracy': acc,
        'balanced_accuracy': balanced_acc,

        # F1 variants
        'f1_weighted': weighted_f1,      # Dominated by the larger classes
        'f1_macro': macro_f1,            # Equal weight to all classes

        # Overall precision/recall
        'precision_weighted': weighted_precision,
        'recall_weighted': weighted_recall,
        'precision_macro': macro_precision,
        'recall_macro': macro_recall,
    }

    # Per-class F1 first, then per-class recall, each in LABELS order, e.g.
    # 'f1_phan_dong' ... 'recall_khong_lien_quan'.
    for prefix, scores in (('f1', per_class_f1), ('recall', per_class_recall)):
        for class_id, class_name in enumerate(LABELS):
            metrics[f"{prefix}_{class_name.lower()}"] = scores[class_id]

    return metrics
from transformers import EarlyStoppingCallback
# Guard against overfitting by halting training once the monitored metric stalls.
# patience=2: give up after 2 evaluations without improvement (one evaluation per
# epoch with the arguments used in this notebook); threshold=0.005: minimum gain
# that counts as an improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.005)

In [7]:
# --- 4. Model Loading ---
print(f"Loading model: {MODEL_NAME} for sequence classification with {NUM_LABELS} labels.")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# --- 5. Training Arguments ---
_training_config = dict(
    # Where checkpoints and logs are written
    output_dir=OUTPUT_DIR,
    logging_dir=LOGGING_DIR,
    logging_steps=50,
    report_to="none",  # No external reporting like wandb/tensorboard

    # Optimisation schedule
    learning_rate=1e-5,
    num_train_epochs=6,
    warmup_ratio=0.15,
    weight_decay=0.02,

    # Batching: 12 examples per device per step, accumulated over 3 steps
    per_device_train_batch_size=12,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=3,
    bf16=True,

    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the highest macro-F1 when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
)
training_args = TrainingArguments(**_training_config)

# --- 7. Trainer Initialization and Training ---
# NOTE(review): training uses `train_dataset`; the oversampled
# `balanced_train_dataset` built earlier in the notebook is not passed here -
# confirm this is intended.
if not train_dataset or not val_dataset:
    print("Cannot start training due to empty train or validation dataset.")
    trainer = None # Ensure trainer is None if not initialized
else:
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_balanced_metrics,
        callbacks=[early_stopping],
    )

    print("Starting training...")
    trainer.train()
    print("Training finished.")

Loading model: uitnlp/CafeBERT for sequence classification with 3 labels.


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at uitnlp/CafeBERT and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,F1 Weighted,F1 Macro,Precision Weighted,Recall Weighted,Precision Macro,Recall Macro,F1 Phan Dong,F1 Khong Phan Dong,F1 Khong Lien Quan,Recall Phan Dong,Recall Khong Phan Dong,Recall Khong Lien Quan
1,0.7431,0.710554,0.69963,0.633035,0.698999,0.642604,0.701572,0.69963,0.658363,0.633035,0.505155,0.645614,0.777044,0.451613,0.682493,0.765
2,0.5952,0.609888,0.739291,0.661792,0.734109,0.678272,0.73362,0.739291,0.703675,0.661792,0.542105,0.681436,0.811276,0.474654,0.661721,0.849
3,0.5141,0.631883,0.746166,0.673503,0.741244,0.690519,0.741271,0.746166,0.716221,0.673503,0.569191,0.687692,0.814674,0.502304,0.663205,0.855
4,0.387,0.680359,0.758329,0.717675,0.757815,0.717864,0.75753,0.758329,0.718245,0.717675,0.623853,0.711916,0.817822,0.626728,0.700297,0.826
5,0.2719,0.76446,0.745108,0.702064,0.745841,0.702457,0.747458,0.745108,0.703651,0.702064,0.594848,0.709261,0.803262,0.585253,0.732938,0.788
6,0.2515,0.819301,0.745637,0.705992,0.746316,0.703035,0.747125,0.745637,0.700267,0.705992,0.597285,0.705969,0.80585,0.608295,0.710682,0.799


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Training finished.


In [11]:
# --- 8. Evaluation on Test Set  ---
if test_dataset and trainer:
    print("\nEvaluating on the test set...")
    # A single predict() call returns both the raw predictions and the metrics
    # produced by compute_metrics, so the test set goes through the model once
    # instead of twice (evaluate() followed by predict()).
    # metric_key_prefix="eval" keeps the metric names as "eval_*".
    predictions_output = trainer.predict(test_dataset, metric_key_prefix="eval")
    results = predictions_output.metrics

    print("Test set evaluation results (from trainer.predict):")
    for key, value in results.items():
        print(f"  {key}: {value:.4f}") # Format output

    # Index of the highest score per example, mapped back to its label string.
    predicted_indices = predictions_output.predictions.argmax(-1)
    predicted_labels_text = [ID_TO_LABEL[idx] for idx in predicted_indices]

    # True string labels of the test split, in the same order as test_dataset.
    true_labels_text = test_labels_str.tolist()

    print("\nClassification Report on Test Set (Corrected Order):")
    # Passing `labels` makes the report rows follow the order of LABELS.
    print(classification_report(
        true_labels_text,
        predicted_labels_text,
        labels=LABELS,
        target_names=LABELS,
        digits=4
    ))
else:
    print("Cannot evaluate on test set: Test dataset is empty or trainer was not initialized.")

print("\nScript finished.")


Evaluating on the test set...


  return forward_call(*args, **kwargs)


Test set evaluation results (from trainer.evaluate):
  eval_loss: 0.6807
  eval_accuracy: 0.7495
  eval_balanced_accuracy: 0.6940
  eval_f1_weighted: 0.7484
  eval_f1_macro: 0.6997
  eval_precision_weighted: 0.7477
  eval_recall_weighted: 0.7495
  eval_precision_macro: 0.7063
  eval_recall_macro: 0.6940
  eval_f1_phan_dong: 0.5811
  eval_f1_khong_phan_dong: 0.7026
  eval_f1_khong_lien_quan: 0.8155
  eval_recall_phan_dong: 0.5556
  eval_recall_khong_phan_dong: 0.7052
  eval_recall_khong_lien_quan: 0.8212
  eval_runtime: 15.7375
  eval_samples_per_second: 120.2220
  eval_steps_per_second: 5.0200
  epoch: 6.0000

Getting predictions for detailed classification report on the test set...


  return forward_call(*args, **kwargs)



Classification Report on Test Set (Corrected Order):
                 precision    recall  f1-score   support

      PHAN_DONG     0.6091    0.5556    0.5811       216
KHONG_PHAN_DONG     0.7000    0.7052    0.7026       675
KHONG_LIEN_QUAN     0.8099    0.8212    0.8155      1001

       accuracy                         0.7495      1892
      macro avg     0.7063    0.6940    0.6997      1892
   weighted avg     0.7477    0.7495    0.7484      1892


Script finished.


In [9]:
# Ask PyTorch to release the CUDA memory it is holding in its cache.
torch.cuda.empty_cache()

In [10]:
    # Save the fine-tuned model and tokenizer
# Statements start at column 0 so this cell is also valid as a plain Python
# script; the previous uniform indentation only ran because IPython dedents cells.
best_model_dir = OUTPUT_DIR + "/best_model"
print("Saving model and tokenizer...")
model.save_pretrained(best_model_dir)
tokenizer.save_pretrained(best_model_dir)
print(f"Model and tokenizer saved to {best_model_dir}")

Saving model and tokenizer...
Model and tokenizer saved to /content/drive/MyDrive/ChongPha_Ver2/VisoBERT/OUTPUT/best_model
