# 🔍 vELECTRA - Vietnamese Fake News Detection

| Item | Detail |
|---|---|
| **Model** | `FPTAI/velectra-base-discriminator-cased` |
| **Data** | `train.csv` (columns: `post_message`, `label`) |
| **Split** | 80% Train / 10% Val / 10% Test (stratified) |
| **Labels** | `0` = Reliable · `1` = Unreliable |
| **Preprocessing** | Emoji → emotion tokens, URL/email/phone removal |

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1. Setup & Install

In [2]:
!pip install -q transformers scikit-learn tqdm

In [3]:
import os, re, random, copy, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tqdm.auto import tqdm

print("✅ All packages imported successfully")

✅ All packages imported successfully


## 2. Mount Google Drive & Check GPU

In [4]:

# Check GPU
if torch.cuda.is_available():
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")
else:
    print("⚠️  No GPU found! Go to Runtime → Change runtime type → GPU")

✅ GPU: Tesla T4


AttributeError: 'torch._C._CudaDeviceProperties' object has no attribute 'total_mem'

## 3. Config

⚠️ **Quan trọng**: Sửa `data_path` trỏ đến file `train.csv` của bạn.

In [6]:
CONFIG = {
    # =====================================================================
    # 👇 SỬA ĐƯỜNG DẪN FILE TRAIN.CSV CỦA BẠN Ở ĐÂY 👇
    # =====================================================================
    # Nếu upload trực tiếp lên Colab:
    "data_path": "/content/drive/MyDrive/PROJECT_FakeNewsDetection-KLKS/vELECTRA/model/data_final/train.csv",
    "output_dir": "/content/drive/MyDrive/PROJECT_FakeNewsDetection-KLKS/vELECTRA/model",
    # =====================================================================

    # --- model ---
    "model_name": "FPTAI/velectra-base-discriminator-cased",
    "max_length": 256,
    "num_labels": 2,

    # --- training ---
    "batch_size": 16,
    "learning_rate": 2e-5,
    "epochs": 5,
    "warmup_ratio": 0.1,
    "weight_decay": 0.01,
    "dropout": 0.2,
    "grad_clip": 1.0,

    # --- freezing ---
    "freeze_layers": 8,
    "freeze_embeddings": True,

    # --- split ---
    "test_size": 0.10,
    "val_size": 0.10,
    "seed": 42,

    # --- device ---
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
}

print(f"📁 Data path  : {CONFIG['data_path']}")
print(f"💾 Output dir : {CONFIG['output_dir']}")
print(f"🖥️  Device     : {CONFIG['device']}")

📁 Data path  : /content/drive/MyDrive/PROJECT_FakeNewsDetection-KLKS/vELECTRA/model/data_final/train.csv
💾 Output dir : /content/drive/MyDrive/PROJECT_FakeNewsDetection-KLKS/vELECTRA/model
🖥️  Device     : cuda


## 4. Seed

In [7]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(CONFIG["seed"])
print(f"✅ Seed set to {CONFIG['seed']}")

✅ Seed set to 42


## 5. Emoji → Emotion Token Mapping

In [8]:
EMOJI_MAP = {
    # --- Vui / Tích cực ---
    "😀": " [VUI] ", "😃": " [VUI] ", "😄": " [VUI] ", "😁": " [VUI] ",
    "😆": " [VUI] ", "😂": " [CƯỜI] ", "🤣": " [CƯỜI] ", "😊": " [VUI] ",
    "😇": " [VUI] ", "🙂": " [VUI] ", "😉": " [TINH_NGHỊCH] ",
    "😍": " [YÊU] ", "🥰": " [YÊU] ", "😘": " [YÊU] ", "😗": " [YÊU] ",
    "😙": " [YÊU] ", "😚": " [YÊU] ", "🥲": " [CẢM_ĐỘNG] ",

    # --- Buồn ---
    "😢": " [BUỒN] ", "😭": " [KHÓC] ", "😞": " [BUỒN] ", "😔": " [BUỒN] ",
    "😟": " [LO_LẮNG] ", "🥺": " [BUỒN] ", "😿": " [BUỒN] ",

    # --- Giận / Tiêu cực ---
    "😠": " [GIẬN] ", "😡": " [GIẬN] ", "🤬": " [GIẬN] ", "😤": " [BỰC] ",
    "👿": " [GIẬN] ", "💢": " [GIẬN] ",

    # --- Ngạc nhiên / Sốc ---
    "😱": " [SỐC] ", "😨": " [SỢ] ", "😰": " [SỢ] ", "😲": " [NGẠC_NHIÊN] ",
    "😮": " [NGẠC_NHIÊN] ", "🤯": " [SỐC] ", "😳": " [NGẠC_NHIÊN] ",

    # --- Sợ ---
    "😧": " [SỢ] ", "😦": " [SỢ] ", "😥": " [SỢ] ",

    # --- Suy nghĩ / Nghi ngờ ---
    "🤔": " [SUY_NGHĨ] ", "🧐": " [SUY_NGHĨ] ", "🤨": " [NGHI_NGỜ] ",

    # --- Cử chỉ ---
    "👍": " [ĐỒNG_Ý] ", "👎": " [PHẢN_ĐỐI] ", "👏": " [KHEN] ",
    "🙏": " [CẦU_NGUYỆN] ", "✌️": " [CHIẾN_THẮNG] ",
    "💪": " [MẠNH_MẼ] ", "🤝": " [BẮT_TAY] ",

    # --- Biểu tượng ---
    "❤️": " [YÊU] ", "💔": " [BUỒN] ", "🔥": " [NÓNG] ", "💯": " [HOÀN_HẢO] ",
    "⚠️": " [CẢNH_BÁO] ", "❌": " [SAI] ", "✅": " [ĐÚNG] ",
    "🚨": " [KHẨN_CẤP] ", "📢": " [THÔNG_BÁO] ", "📌": " [GHI_CHÚ] ",
    "⭐": " [NGÔI_SAO] ", "🌟": " [NGÔI_SAO] ", "💀": " [NGUY_HIỂM] ",
    "🤡": " [HỀ] ", "🐍": " [XẤU_XA] ",

    # --- Cờ / Quốc gia ---
    "🇻🇳": " [VIỆT_NAM] ",

    # --- Misc ---
    "😷": " [BỆNH] ", "🤒": " [BỆNH] ", "🤧": " [BỆNH] ",
    "💉": " [TIÊM_CHỦNG] ", "🦠": " [VIRUS] ", "😴": " [NGỦ] ",
    "🤫": " [IM_LẶNG] ", "🤥": " [NÓI_DỐI] ",
}

# Regex bắt emoji Unicode còn sót
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001F900-\U0001F9FF"
    "\U0001FA00-\U0001FA6F"
    "\U0001FA70-\U0001FAFF"
    "\U00002600-\U000026FF"
    "\U0000FE00-\U0000FE0F"
    "\U0000200D"
    "\U00002B50"
    "]+"
)

def convert_emoji(text: str) -> str:
    """Chuyển emoji/icon thành token cảm xúc tiếng Việt."""
    for emoji_char, token in EMOJI_MAP.items():
        text = text.replace(emoji_char, token)
    text = _EMOJI_RE.sub(" [EMOJI] ", text)
    return text

# Demo
print(convert_emoji("Tin sốt 😱🔥 COVID lây qua 5G 🤥❌"))
print(f"✅ {len(EMOJI_MAP)} emoji mappings loaded")

Tin sốt  [SỐC]  [NÓNG]  COVID lây qua 5G  [NÓI_DỐI]  [SAI] 
✅ 75 emoji mappings loaded


## 6. Text Cleaning

In [9]:
def clean_text(text: str) -> str:
    """Tiền xử lý tối thiểu, giữ đặc trưng ngôn ngữ."""
    if pd.isna(text):
        return ""
    text = str(text)

    # 1. Loại HTML tags
    text = re.sub(r"<[^>]+>", " ", text)

    # 2. Loại URLs
    text = re.sub(r"http[s]?://\S+", " [URL] ", text)
    text = re.sub(r"www\.\S+", " [URL] ", text)

    # 3. Loại email
    text = re.sub(r"\S+@\S+", " [EMAIL] ", text)

    # 4. Loại số điện thoại
    text = re.sub(r"(\+84|0)\d{1,3}[-.\s]?\d{3,4}[-.\s]?\d{3,4}", " [PHONE] ", text)

    # 5. Convert emoji → token cảm xúc
    text = convert_emoji(text)

    # 6. Chuẩn hoá khoảng trắng
    text = re.sub(r"\s+", " ", text).strip()

    return text

def is_valid(text: str) -> bool:
    return isinstance(text, str) and len(text.split()) >= 5

print("✅ Text cleaning functions defined")

✅ Text cleaning functions defined


## 7. Load & Split Data (80/10/10)

In [10]:
def load_and_split(config: dict):
    print("\n" + "=" * 80)
    print("📂 LOADING DATA")
    print("=" * 80)

    data_path = config["data_path"]
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"❌ Not found: {data_path}")

    df = pd.read_csv(data_path)
    print(f"Raw rows: {len(df)}")

    # Rename columns
    if "post_message" in df.columns:
        df.rename(columns={"post_message": "text"}, inplace=True)
    if "text" not in df.columns:
        raise ValueError("❌ Cannot find text column (post_message)")
    if "label" not in df.columns:
        raise ValueError("❌ Cannot find label column")

    df = df[["text", "label"]].dropna()
    df["label"] = df["label"].astype(int)

    # Validate labels
    bad = df[~df["label"].isin([0, 1])]
    if len(bad) > 0:
        raise ValueError(f"❌ Found labels not in {{0,1}}. Examples:\n{bad.head()}")

    # Clean text
    print("🧹 Cleaning text + converting emoji → emotion tokens ...")
    df["text_clean"] = df["text"].apply(clean_text)
    df = df[df["text_clean"].apply(is_valid)].copy()

    # Dedup
    before = len(df)
    df = df.drop_duplicates(subset=["text_clean"], keep="first").reset_index(drop=True)
    print(f"✅ After clean + dedup: {len(df)} (removed {before - len(df)} duplicates)")

    n = len(df)
    c0 = int((df["label"] == 0).sum())
    c1 = int((df["label"] == 1).sum())
    print(f"✅ Total: {n} | Reliable(0)={c0} ({c0/n:.1%}) | Unreliable(1)={c1} ({c1/n:.1%})")

    # ----- Stratified split: 80 / 10 / 10 -----
    print("\n" + "=" * 80)
    print("✂️  SPLITTING DATA: 80% Train / 10% Val / 10% Test (stratified)")
    print("=" * 80)

    seed = config["seed"]

    # Step 1: 80% train + 20% temp
    train_df, temp_df = train_test_split(
        df, test_size=(config["test_size"] + config["val_size"]),
        stratify=df["label"], random_state=seed,
    )

    # Step 2: 50/50 temp → val + test
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5,
        stratify=temp_df["label"], random_state=seed,
    )

    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    def _print_split(name, d):
        n = len(d)
        c0 = int((d["label"] == 0).sum())
        c1 = int((d["label"] == 1).sum())
        print(f"  {name:6s}: {n:>6d} | Reliable={c0} ({c0/n:.1%}) | Unreliable={c1} ({c1/n:.1%})")

    _print_split("Train", train_df)
    _print_split("Val", val_df)
    _print_split("Test", test_df)

    return train_df, val_df, test_df

train_df, val_df, test_df = load_and_split(CONFIG)


📂 LOADING DATA
Raw rows: 4372
🧹 Cleaning text + converting emoji → emotion tokens ...
✅ After clean + dedup: 4265 (removed 30 duplicates)
✅ Total: 4265 | Reliable(0)=3544 (83.1%) | Unreliable(1)=721 (16.9%)

✂️  SPLITTING DATA: 80% Train / 10% Val / 10% Test (stratified)
  Train :   3412 | Reliable=2835 (83.1%) | Unreliable=577 (16.9%)
  Val   :    426 | Reliable=354 (83.1%) | Unreliable=72 (16.9%)
  Test  :    427 | Reliable=355 (83.1%) | Unreliable=72 (16.9%)


## 8. Dataset & DataLoader

In [11]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length: int):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tok = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tok(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "label": torch.tensor(int(self.labels[idx]), dtype=torch.long),
        }

print("✅ NewsDataset class defined")

✅ NewsDataset class defined


## 9. Helper Functions

In [12]:
def compute_class_weights(labels, device):
    counts = np.bincount(labels, minlength=2)
    weights = counts.sum() / (2.0 * np.maximum(counts, 1))
    return counts, torch.tensor(weights, dtype=torch.float32, device=device)


def freeze_backbone(model, freeze_layers: int, freeze_embeddings: bool):
    if freeze_layers <= 0 and not freeze_embeddings:
        return
    if hasattr(model, "electra"):
        if freeze_embeddings and hasattr(model.electra, "embeddings"):
            for p in model.electra.embeddings.parameters():
                p.requires_grad = False
        if hasattr(model.electra, "encoder"):
            for i, layer in enumerate(model.electra.encoder.layer):
                if i < freeze_layers:
                    for p in layer.parameters():
                        p.requires_grad = False
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"🧊 Frozen: {total - trainable:,} / {total:,} params  |  Trainable: {trainable:,}")


@torch.no_grad()
def evaluate(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.0
    criterion = nn.CrossEntropyLoss()
    for batch in dataloader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        out = model(input_ids=ids, attention_mask=mask)
        loss = criterion(out.logits, labels)
        total_loss += loss.item()
        preds = out.logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.cpu().numpy().tolist())
    avg_loss = total_loss / max(len(dataloader), 1)
    acc = accuracy_score(all_labels, all_preds)
    f1m = f1_score(all_labels, all_preds, average="macro")
    return avg_loss, acc, f1m, np.array(all_labels), np.array(all_preds)

print("✅ Helper functions defined")

✅ Helper functions defined


## 10. 🚀 Training

In [13]:
device = CONFIG["device"]

# --- Tokenizer ---
print("\n" + "=" * 80)
print(f"🤖 MODEL: {CONFIG['model_name']}")
print("=" * 80)

tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"], use_fast=False)

train_loader = DataLoader(
    NewsDataset(train_df["text_clean"].values, train_df["label"].values,
                tokenizer, CONFIG["max_length"]),
    batch_size=CONFIG["batch_size"], shuffle=True,
)
val_loader = DataLoader(
    NewsDataset(val_df["text_clean"].values, val_df["label"].values,
                tokenizer, CONFIG["max_length"]),
    batch_size=CONFIG["batch_size"], shuffle=False,
)
test_loader = DataLoader(
    NewsDataset(test_df["text_clean"].values, test_df["label"].values,
                tokenizer, CONFIG["max_length"]),
    batch_size=CONFIG["batch_size"], shuffle=False,
)

# --- Model ---
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG["model_name"], num_labels=CONFIG["num_labels"],
).to(device)

try:
    model.resize_token_embeddings(len(tokenizer))
except Exception:
    pass

try:
    model.config.hidden_dropout_prob = CONFIG["dropout"]
    model.config.attention_probs_dropout_prob = CONFIG["dropout"]
except Exception:
    pass

freeze_backbone(model, CONFIG["freeze_layers"], CONFIG["freeze_embeddings"])

# --- Class weights ---
counts, class_w = compute_class_weights(
    train_df["label"].values.astype(int), device,
)
print(f"📊 Class counts [Reliable, Unreliable]: {counts}")
print(f"   Class weights: {class_w.cpu().numpy()}")
criterion = nn.CrossEntropyLoss(weight=class_w)

# --- Optimizer & Scheduler ---
optimizer = AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
)
total_steps = len(train_loader) * CONFIG["epochs"]
warmup_steps = int(total_steps * CONFIG["warmup_ratio"])
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps,
)

# --- Training Loop ---
print("\n" + "=" * 80)
print("🚀 TRAINING")
print("=" * 80)
print(f"   Device     : {device}")
print(f"   Epochs     : {CONFIG['epochs']}")
print(f"   Batch size : {CONFIG['batch_size']}")
print(f"   LR         : {CONFIG['learning_rate']}")
print(f"   Max length : {CONFIG['max_length']}")
print(f"   Total steps: {total_steps}  (warmup: {warmup_steps})")
print()

best_state = None
best_f1 = -1.0

for epoch in range(1, CONFIG["epochs"] + 1):
    model.train()
    running_loss = 0.0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{CONFIG['epochs']}", leave=True)
    for batch in pbar:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad(set_to_none=True)
        out = model(input_ids=ids, attention_mask=mask)
        loss = criterion(out.logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG["grad_clip"])
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        pbar.set_postfix(loss=f"{loss.item():.4f}")

    train_loss = running_loss / max(len(train_loader), 1)

    # --- Validation ---
    val_loss, val_acc, val_f1, _, _ = evaluate(model, val_loader, device)

    marker = ""
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_state = copy.deepcopy(model.state_dict())
        marker = " ✅ best"

    print(
        f"  → train_loss={train_loss:.4f}  |  "
        f"val_loss={val_loss:.4f}  val_acc={val_acc*100:.2f}%  "
        f"val_macroF1={val_f1:.4f}{marker}"
    )

# Load best model
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"\n🏆 Best Val Macro-F1: {best_f1:.4f}")


🤖 MODEL: FPTAI/velectra-base-discriminator-cased


config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

ElectraForSequenceClassification LOAD REPORT from: FPTAI/velectra-base-discriminator-cased
Key                                               | Status     | 
--------------------------------------------------+------------+-
discriminator_predictions.dense_prediction.weight | UNEXPECTED | 
discriminator_predictions.dense.weight            | UNEXPECTED | 
discriminator_predictions.dense.bias              | UNEXPECTED | 
discriminator_predictions.dense_prediction.bias   | UNEXPECTED | 
classifier.out_proj.bias                          | MISSING    | 
classifier.dense.bias                             | MISSING    | 
classifier.dense.weight                           | MISSING    | 
classifier.out_proj.weight                        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


🧊 Frozen: 81,716,736 / 110,660,354 params  |  Trainable: 28,943,618
📊 Class counts [Reliable, Unreliable]: [2835  577]
   Class weights: [0.60176367 2.9566724 ]

🚀 TRAINING
   Device     : cuda
   Epochs     : 5
   Batch size : 16
   LR         : 2e-05
   Max length : 256
   Total steps: 1070  (warmup: 107)



Epoch 1/5:   0%|          | 0/214 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

  → train_loss=0.4997  |  val_loss=0.3008  val_acc=87.56%  val_macroF1=0.8026 ✅ best


Epoch 2/5:   0%|          | 0/214 [00:00<?, ?it/s]

  → train_loss=0.3265  |  val_loss=0.2799  val_acc=90.61%  val_macroF1=0.8271 ✅ best


Epoch 3/5:   0%|          | 0/214 [00:00<?, ?it/s]

  → train_loss=0.2429  |  val_loss=0.2798  val_acc=91.08%  val_macroF1=0.8357 ✅ best


Epoch 4/5:   0%|          | 0/214 [00:00<?, ?it/s]

  → train_loss=0.1766  |  val_loss=0.3414  val_acc=90.85%  val_macroF1=0.8196


Epoch 5/5:   0%|          | 0/214 [00:00<?, ?it/s]

  → train_loss=0.1369  |  val_loss=0.3401  val_acc=91.55%  val_macroF1=0.8366 ✅ best

🏆 Best Val Macro-F1: 0.8366


## 11. 🎯 Evaluation on Test Set

In [14]:
print("\n" + "=" * 80)
print("🎯 FINAL EVALUATION ON TEST SET")
print("=" * 80)

test_loss, test_acc, test_f1, y_true, y_pred = evaluate(model, test_loader, device)

f1_reliable = f1_score(y_true, y_pred, pos_label=0, average="binary")
f1_unreliable = f1_score(y_true, y_pred, pos_label=1, average="binary")

print(f"  Test Loss       : {test_loss:.4f}")
print(f"  Test Accuracy   : {test_acc*100:.2f}%")
print(f"  Macro-F1        : {test_f1:.4f}")
print(f"  F1 Reliable(0)  : {f1_reliable:.4f}")
print(f"  F1 Unreliable(1): {f1_unreliable:.4f}")
print(f"  Gap             : {abs(f1_reliable - f1_unreliable):.4f}")
print()
print(classification_report(
    y_true, y_pred,
    target_names=["Reliable (0)", "Unreliable (1)"],
    digits=4,
))


🎯 FINAL EVALUATION ON TEST SET
  Test Loss       : 0.3518
  Test Accuracy   : 90.16%
  Macro-F1        : 0.8095
  F1 Reliable(0)  : 0.9420
  F1 Unreliable(1): 0.6769
  Gap             : 0.2651

                precision    recall  f1-score   support

  Reliable (0)     0.9241    0.9606    0.9420       355
Unreliable (1)     0.7586    0.6111    0.6769        72

      accuracy                         0.9016       427
     macro avg     0.8414    0.7858    0.8095       427
  weighted avg     0.8962    0.9016    0.8973       427



## 12. 💾 Save Model

In [15]:
output_dir = CONFIG["output_dir"]
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("=" * 80)
print(f"💾 Model saved to: {output_dir}")
print("=" * 80)
print()
print("Files saved:")
for f in os.listdir(output_dir):
    size = os.path.getsize(os.path.join(output_dir, f))
    print(f"  📄 {f} ({size/1e6:.1f} MB)")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

💾 Model saved to: /content/drive/MyDrive/PROJECT_FakeNewsDetection-KLKS/vELECTRA/model

Files saved:
  📄 train_velectra_colab.ipynb (0.1 MB)
  📄 convert_parquet.py (0.0 MB)
  📄 process_dataset.py (0.0 MB)
  📄 train_velectra.py (0.0 MB)
  📄 data_final (0.0 MB)
  📄 config.json (0.0 MB)
  📄 model.safetensors (442.7 MB)
  📄 tokenizer_config.json (0.0 MB)
  📄 tokenizer.json (0.7 MB)


## 13. Sử dụng model đã train để test

In [18]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

LABEL_MAP = {0: "reliable", 1: "fake"}

device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_PATH = "/content/drive/MyDrive/PROJECT_FakeNewsDetection-KLKS/vELECTRA/model"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

model.to(device)
model.eval()

def predict(text, max_length=256):

    inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=-1)

    label = torch.argmax(probs, dim=-1).item()

    return {
        "label_id": label,
        "label": LABEL_MAP[label],
        "prob_reliable": probs[0][0].item(),
        "prob_fake": probs[0][1].item()
    }

# ===============================
# NHẬP TỪ BÀN PHÍM
# ===============================

while True:
    text = input("\nNhập nội dung cần kiểm tra (q để thoát): ").strip()

    if text.lower() in ["q", "quit", "exit"]:
        print("Đã thoát.")
        break

    if len(text) == 0:
        print("⚠️ Nội dung trống, nhập lại.")
        continue

    result = predict(text)

    print(
        f"→ KẾT QUẢ: {result['label'].upper()} | "
        f"P(reliable)={result['prob_reliable']:.4f} | "
        f"P(fake)={result['prob_fake']:.4f}"
    )


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]


Nhập nội dung cần kiểm tra (q để thoát): Có thể sau những biến cố và thăng trầm cuộc sống, “hoa hậu đông con nhất Việt Nam” đang lựa chọn một cuộc sống an yên, kín tiếng. Hồ Oanh Yến sinh năm 1986 tại Vũng Tàu, từng hoạt động với vai trò người mẫu trước khi thử sức tại các đấu trường nhan sắc. Năm 2015, cô đăng quang Hoa hậu Thế giới Toàn cầu tại Philippines. Đến năm 2019, người đẹp tiếp tục giành danh hiệu Nữ hoàng Sắc đẹp Thế giới. Bên cạnh các danh hiệu sắc đẹp, Oanh Yến còn gây chú ý khi trải qua 6 lần sinh nở và được công chúng gọi là “hoa hậu đông con nhất Việt Nam”. Cũng trong năm 2019, cô từng khiến nhiều người ngưỡng mộ khi được chồng là doanh nhân tặng căn biệt thự trị giá khoảng 100 tỷ đồng tại Bình Dương. Tuy nhiên, biến cố ập đến khi Oanh Yến tiết lộ bị bạn bè phản bội, chơi xấu dẫn đến cảnh tan gia bại sản, phải bán hết tài sản. Năm 2020, cô rời TP.HCM, đưa gia đình về Đồng Nai xây dựng trang trại rộng hơn 10 ha, hướng đến mô hình nông nghiệp tự cung tự cấp. Gia đình hạn