In [None]:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import sys, subprocess, warnings
warnings.filterwarnings("ignore")

def ensure_pkg(import_name, pip_name=None):
    """Make sure *import_name* is importable, installing it with pip if missing.

    Args:
        import_name: Module name as used in an ``import`` statement.
        pip_name: Distribution name passed to ``pip install``; defaults to
            ``import_name``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with an error.
    """
    import importlib

    if pip_name is None:
        pip_name = import_name
    try:
        importlib.import_module(import_name)
    except ImportError:
        # Only a missing module justifies an install; any other import-time
        # failure is a real problem that pip would merely hide.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pip_name])
        # Let the running interpreter see the freshly installed package.
        importlib.invalidate_caches()

# Check every runtime dependency up front; each entry is (import name, pip name).
print("📦 Installing packages...")
for import_name, pip_name in (
    ("transformers", "transformers"),
    ("sklearn", "scikit-learn"),
    ("tqdm", "tqdm"),
    ("pandas", "pandas"),
    ("numpy", "numpy"),
    ("torch", "torch"),
):
    ensure_pkg(import_name, pip_name)

import re, random, copy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from tqdm.auto import tqdm

# ============================================================
# CONFIG
# ============================================================

# Single source of settings read by the data pipeline, training and ensembling below.
CONFIG = {
    "data_path": "/content/sample_data/fake_news_dataset.csv",  # change the file if needed

    # Hugging Face checkpoint names of the two ensemble members
    "electra_name": "FPTAI/velectra-base-discriminator-cased",
    "phobert_name": "vinai/phobert-base",

    # SAFE lengths (tokenizer max_length; inputs are padded/truncated to this)
    "max_length_electra": 256,
    "max_length_phobert": 256,

    "batch_size_electra": 16,
    "batch_size_phobert": 8,

    # Optimisation settings shared by both models
    "learning_rate": 1e-5,
    "epochs_electra": 5,
    "epochs_phobert": 5,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    "dropout": 0.2,

    # Number of leading encoder layers (and optionally embeddings) kept frozen
    "freeze_electra_layers": 8,
    "freeze_electra_embeddings": True,
    "freeze_phobert_layers": 0,
    "freeze_phobert_embeddings": False,

    # Ensemble
    # w_grid: candidate ELECTRA weights; thr_grid: candidate decision thresholds on P(FAKE)
    "w_grid": np.linspace(0.05, 0.95, 19),
    "thr_grid": np.linspace(0.20, 0.80, 61),
    # When enabled, a model whose confidence reaches override_conf replaces the blend
    "use_override": True,
    "override_conf": 0.92,

    # Leak/Near-dup settings
    # char ngram cosine >= threshold => same cluster
    "near_dup_threshold": 0.92,
    # neighbors to check per sample (bigger -> more robust, slower)
    "near_dup_k": 20,
    # TFIDF char settings
    "char_ngram_range": (4, 6),
    "min_df": 2,

    # Split ratios (approx)
    "test_ratio": 0.10,
    "val_ratio_of_trainval": 0.11,  # same as your old code

    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "seed": 42,
}

# Start-up banner: title, selected device and the label convention used throughout.
for banner_line in (
    "="*90,
    "🚀 VIETNAMESE FAKE NEWS - ENSEMBLE (LEAK-SAFE SPLIT)",
    "="*90,
    f"🖥️ Device: {CONFIG['device']}",
    "🧾 Label mapping: 0=REAL, 1=FAKE",
):
    print(banner_line)

# ============================================================
# SEED
# ============================================================

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    CUDA generators are seeded only when CUDA is available; a failure while
    seeding them is ignored on purpose (best effort).
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    try:
        torch.cuda.manual_seed_all(seed)
    except Exception:
        pass

# Seed all generators once, before any data splitting or model initialisation runs.
set_seed(CONFIG["seed"])

# ============================================================
# CLEAN TEXT
# ============================================================

def clean_text(text):
    """Normalise one raw article string.

    Missing values become ``""``. Otherwise the value is converted to ``str``,
    HTML-like tags and http(s) URLs are replaced by a space, and every run of
    whitespace is collapsed to a single space with the ends trimmed.

    Args:
        text: Raw cell value (string, number or a pandas missing value).

    Returns:
        The cleaned string.
    """
    if pd.isna(text):
        return ""

    cleaned = str(text)
    # Tags first, then URLs; both are replaced by a space so words do not merge.
    for pattern in (r"<[^>]+>", r"http[s]?://\S+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()

def is_valid(text, min_words=8):
    """Return True when *text* has at least *min_words* whitespace-separated words.

    Args:
        text: Already-cleaned string.
        min_words: Minimum word count required; defaults to 8.
    """
    return len(text.split()) >= min_words

# ============================================================
# Union-Find for clustering near-duplicates
# ============================================================

class UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``.

    ``p`` holds each element's parent (roots are their own parent) and ``r``
    holds the rank used to decide which root absorbs the other on a merge.
    """

    def __init__(self, n):
        self.p = [i for i in range(n)]
        self.r = [0 for _ in range(n)]

    def find(self, x):
        """Return the root of *x*, shortening the visited path along the way."""
        parent = self.p
        while parent[x] != x:
            # Re-point x at its grandparent, then continue from there.
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(self, a, b):
        """Merge the sets that contain *a* and *b* (no-op if already merged)."""
        keep, drop = self.find(a), self.find(b)
        if keep == drop:
            return
        # The root with the larger rank stays; on a tie the root of `a` stays.
        if self.r[keep] < self.r[drop]:
            keep, drop = drop, keep
        self.p[drop] = keep
        if self.r[keep] == self.r[drop]:
            self.r[keep] += 1

def build_near_dup_groups(texts, threshold=0.92, k=20, ngram_range=(4,6), min_df=2):
    """Assign a cluster id to every text so that near-duplicates share an id.

    Texts are embedded with a character n-gram TF-IDF (``char_wb`` analyzer).
    For each text its ``min(k, n)`` nearest neighbours by cosine distance are
    looked up, and any pair whose similarity ``1 - distance`` is at least
    *threshold* is merged with a union-find structure.

    Args:
        texts: Sequence of strings.
        threshold: Minimum cosine similarity for two texts to be merged.
        k: Number of neighbours inspected per text.
        ngram_range: Character n-gram range passed to the vectorizer.
        min_df: Minimum document frequency passed to the vectorizer.

    Returns:
        Integer array of length ``len(texts)``; ids are consecutive and are
        numbered in order of first appearance.
    """
    total = len(texts)
    if total == 0:
        return np.array([], dtype=int)

    vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=ngram_range,
        min_df=min_df,
        dtype=np.float32
    )
    tfidf = vectorizer.fit_transform(texts)

    # Brute-force cosine neighbours; similarity = 1 - distance.
    knn = NearestNeighbors(
        n_neighbors=min(k, total),
        metric="cosine",
        algorithm="brute",
        n_jobs=-1
    )
    knn.fit(tfidf)
    distances, neighbors = knn.kneighbors(tfidf, return_distance=True)

    clusters = UnionFind(total)
    for row in range(total):
        for col, other in enumerate(neighbors[row]):
            other = int(other)
            if other == row:
                continue
            if 1.0 - float(distances[row][col]) >= threshold:
                clusters.union(row, other)

    # Map each root to a consecutive id in order of first appearance.
    first_seen = {}
    group_ids = np.zeros(total, dtype=int)
    for row in range(total):
        root = clusters.find(row)
        group_ids[row] = first_seen.setdefault(root, len(first_seen))
    return group_ids

def report_leak_exact(a_texts, b_texts, name):
    """Print and return how many distinct texts occur in both collections.

    Args:
        a_texts: First iterable of strings.
        b_texts: Second iterable of strings.
        name: Label shown in the printed line.

    Returns:
        Number of distinct strings present in both inputs.
    """
    shared = set(a_texts).intersection(set(b_texts))
    inter = len(shared)
    print(f"Leak exact {name}: {inter}")
    return inter

def report_leak_near(a_texts, b_texts, threshold=0.92, ngram_range=(4,6), min_df=2):
    """Summarise how close each text in B is to its nearest text in A.

    A character n-gram TF-IDF is fitted on *a_texts*; *b_texts* are projected
    into that space and, for each of them, the cosine similarity to the single
    nearest A text is taken. Not perfect, but a good signal of near-duplicate
    leakage.

    Args:
        a_texts: Reference texts (the vectorizer is fitted on these).
        b_texts: Texts whose nearest reference neighbour is measured.
        threshold: Similarity at or above which a B text is counted as a
            near-duplicate.
        ngram_range: Character n-gram range passed to the vectorizer.
        min_df: Minimum document frequency passed to the vectorizer.

    Returns:
        Dict with keys ``mean``, ``median``, ``p95``, ``max`` and
        ``count_ge_thr``. All values are zero when either input is empty.
    """
    if len(a_texts) == 0 or len(b_texts) == 0:
        # Same keys as the normal path so callers can index unconditionally.
        return {"mean": 0.0, "median": 0.0, "p95": 0.0, "max": 0.0, "count_ge_thr": 0}

    vec = TfidfVectorizer(analyzer="char_wb", ngram_range=ngram_range, min_df=min_df, dtype=np.float32)
    X_a = vec.fit_transform(a_texts)
    X_b = vec.transform(b_texts)

    # cosine similarity = 1 - cosine distance
    # (named `knn` so the module-level `nn` alias for torch.nn is not shadowed)
    knn = NearestNeighbors(n_neighbors=1, metric="cosine", algorithm="brute", n_jobs=-1).fit(X_a)
    dists, _ = knn.kneighbors(X_b, return_distance=True)
    sims = 1.0 - dists.reshape(-1)

    return {
        "mean": float(np.mean(sims)),
        "median": float(np.median(sims)),
        "p95": float(np.quantile(sims, 0.95)),
        "max": float(np.max(sims)),
        "count_ge_thr": int(np.sum(sims >= threshold)),
    }

# ============================================================
# LOAD DATA + CLEAN + EXACT DEDUP
# ============================================================

print("\n" + "="*90)
print("📂 LOADING DATA")
print("="*90)

if not os.path.exists(CONFIG["data_path"]):
    raise FileNotFoundError(f"❌ Not found: {CONFIG['data_path']}")

df = pd.read_csv(CONFIG["data_path"])

# Detect the text column: use "text" if present, otherwise the first known alias.
if "text" not in df.columns:
    for c in ["content", "article", "news", "body", "title"]:
        if c in df.columns:
            df["text"] = df[c]
            break
if "text" not in df.columns:
    raise ValueError("❌ Cannot find text column")

# Same idea for the label column.
if "label" not in df.columns:
    for c in ["class", "category", "y"]:
        if c in df.columns:
            df["label"] = df[c]
            break
if "label" not in df.columns:
    raise ValueError("❌ Cannot find label column")

df = df[["text", "label"]].dropna()

# Validate BEFORE casting to int: astype(int) truncates (0.7 -> 0), which would
# let invalid labels slip through a check made after the cast.
label_numeric = pd.to_numeric(df["label"], errors="coerce")
bad = df[~label_numeric.isin([0, 1])]
if len(bad) > 0:
    raise ValueError(f"❌ Found labels not in {{0,1}}. Examples:\n{bad.head()}")
df["label"] = label_numeric.astype(int)

print("🧹 Cleaning + exact dedup...")
df["text_clean"] = df["text"].apply(clean_text)
# Drop texts that are too short, then keep the first copy of each exact duplicate.
df = df[df["text_clean"].apply(is_valid)].copy()
before = len(df)
df = df.drop_duplicates(subset=["text_clean"], keep="first").reset_index(drop=True)
print(f"✅ After exact dedup: {len(df)} (removed {before-len(df)})")

n = len(df)
if n == 0:
    # Fail with a clear message instead of a ZeroDivisionError in the summary below.
    raise ValueError("❌ No valid samples left after cleaning and deduplication")
c0 = int((df["label"]==0).sum())
c1 = int((df["label"]==1).sum())
print(f"✅ Final: {n} samples | REAL={c0} ({c0/n:.1%}) | FAKE={c1} ({c1/n:.1%})")

# ============================================================
# BUILD NEAR-DUP GROUPS (CLUSTERING)
# ============================================================

print("\n" + "="*90)
print("🧩 BUILDING NEAR-DUP CLUSTERS")
print("="*90)

# Cluster the cleaned texts; every row receives the id of its near-duplicate group.
_cluster_options = {
    "threshold": float(CONFIG["near_dup_threshold"]),
    "k": int(CONFIG["near_dup_k"]),
    "ngram_range": tuple(CONFIG["char_ngram_range"]),
    "min_df": int(CONFIG["min_df"]),
}
groups = build_near_dup_groups(df["text_clean"].tolist(), **_cluster_options)
df["group"] = groups

# Summary: number of groups and the biggest ones.
group_sizes = df["group"].value_counts()
n_groups = int(df["group"].nunique())
print(f"✅ Groups: {n_groups} | Largest group size: {int(group_sizes.max())}")
print(f"Top 5 group sizes:\n{group_sizes.head(5).to_string()}")

# ============================================================
# STRATIFIED GROUP SPLIT: TRAIN / VAL / TEST
# ============================================================

print("\n" + "="*90)
print("✂️ LEAK-SAFE SPLIT (STRATIFIED BY LABEL, GROUP-AWARE)")
print("="*90)

y = df["label"].values
g = df["group"].values

test_target = float(CONFIG["test_ratio"])
val_target = float(CONFIG["val_ratio_of_trainval"])
for _ratio_name, _ratio_value in (("test_ratio", test_target), ("val_ratio_of_trainval", val_target)):
    if not 0.0 < _ratio_value < 1.0:
        raise ValueError(f"❌ CONFIG['{_ratio_name}'] must be in (0, 1), got {_ratio_value}")

# Step 1: split out test using StratifiedGroupKFold.
# One fold of a K-fold split holds about 1/K of the data, so K is derived from
# the requested ratio (0.10 -> 10 folds); the fold closest to the target is kept.
kfold = max(2, int(round(1.0 / test_target)))
sgkf = StratifiedGroupKFold(n_splits=kfold, shuffle=True, random_state=CONFIG["seed"])

best_fold = None
best_diff = 1e9

splits = list(sgkf.split(df, y, groups=g))
for fold_i, (trainval_idx, test_idx) in enumerate(splits):
    ratio = len(test_idx)/len(df)
    diff = abs(ratio - test_target)
    if diff < best_diff:
        best_diff = diff
        best_fold = (trainval_idx, test_idx, ratio, fold_i)

trainval_idx, test_idx, ratio, fold_i = best_fold
print(f"Picked fold {fold_i} for TEST: size={len(test_idx)} ({ratio:.3f})")

df_trainval = df.iloc[trainval_idx].reset_index(drop=True)
df_test = df.iloc[test_idx].reset_index(drop=True)

# Step 2: split trainval into train/val with a second group-aware stratified
# K-fold, again deriving K from the requested ratio (0.11 -> 9 folds).
kfold2 = max(2, int(round(1.0 / val_target)))
sgkf2 = StratifiedGroupKFold(n_splits=kfold2, shuffle=True, random_state=CONFIG["seed"]+7)
splits2 = list(sgkf2.split(df_trainval, df_trainval["label"].values, groups=df_trainval["group"].values))

best2 = None
best2_diff = 1e9
for fold_i2, (train_idx, val_idx) in enumerate(splits2):
    ratio2 = len(val_idx)/len(df_trainval)
    diff2 = abs(ratio2 - val_target)
    if diff2 < best2_diff:
        best2_diff = diff2
        best2 = (train_idx, val_idx, ratio2, fold_i2)

train_idx, val_idx, ratio2, fold_i2 = best2
print(f"Picked fold {fold_i2} for VAL: size={len(val_idx)} ({ratio2:.3f})")

train_df = df_trainval.iloc[train_idx].reset_index(drop=True)
val_df = df_trainval.iloc[val_idx].reset_index(drop=True)
test_df = df_test.copy()

def dist_print(name, dfx):
    """Print size, class balance and number of groups of one split.

    Args:
        name: Label printed at the start of the line.
        dfx: DataFrame with ``label`` (0 = REAL, 1 = FAKE) and ``group`` columns.
    """
    n = len(dfx)
    c0 = int((dfx["label"]==0).sum())
    c1 = int((dfx["label"]==1).sum())
    # An empty split reports 0.0% instead of raising ZeroDivisionError.
    denom = max(n, 1)
    print(f"{name}: {n} | REAL={c0} ({c0/denom:.1%}) | FAKE={c1} ({c1/denom:.1%}) | groups={dfx['group'].nunique()}")

dist_print("Train", train_df)
dist_print("Val  ", val_df)
dist_print("Test ", test_df)

# hard check: no group overlap
overlap_tv = set(train_df["group"]) & set(val_df["group"])
overlap_tt = set(train_df["group"]) & set(test_df["group"])
overlap_vt = set(val_df["group"]) & set(test_df["group"])
print(f"\nGroup overlap Train∩Val={len(overlap_tv)} | Train∩Test={len(overlap_tt)} | Val∩Test={len(overlap_vt)}")
# Enforce the check: a shared near-duplicate group would leak into evaluation,
# so stop here rather than train and report inflated metrics.
if overlap_tv or overlap_tt or overlap_vt:
    raise RuntimeError("❌ Near-duplicate groups are shared between splits; the split is not leak-safe")

# ============================================================
# LEAK REPORT (EXACT + NEAR)
# ============================================================

print("\n" + "="*90)
print("🧪 LEAK REPORT")
print("="*90)

# Exact-match overlap between every pair of splits.
for _left, _right, _tag in (
    (train_df, val_df, "Train∩Val"),
    (train_df, test_df, "Train∩Test"),
    (val_df, test_df, "Val∩Test"),
):
    report_leak_exact(_left["text_clean"], _right["text_clean"], _tag)

# Near-duplicate similarity: for each text of the second split, cosine
# similarity to its nearest text in the first split.
_near_options = {
    "threshold": CONFIG["near_dup_threshold"],
    "ngram_range": CONFIG["char_ngram_range"],
    "min_df": CONFIG["min_df"],
}
stats_tv = report_leak_near(train_df["text_clean"].tolist(), val_df["text_clean"].tolist(), **_near_options)
stats_tt = report_leak_near(train_df["text_clean"].tolist(), test_df["text_clean"].tolist(), **_near_options)
stats_vt = report_leak_near(val_df["text_clean"].tolist(), test_df["text_clean"].tolist(), **_near_options)

print("\nNear-dup cosine stats (char-ngram TFIDF):")
for _caption, _stats in (("Val->Train:", stats_tv), ("Test->Train:", stats_tt), ("Test->Val:", stats_vt)):
    print(_caption, _stats)

# ============================================================
# DATASET / DATALOADER
# ============================================================

class NewsDataset(Dataset):
    """Torch dataset that tokenizes one text per item on access.

    Args:
        texts: Iterable of strings.
        labels: Iterable of integer-convertible class labels.
        tokenizer: Callable tokenizer that also supports ``len()``.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = list(texts), list(labels)
        self.tok = tokenizer
        self.max_length = int(max_length)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for item *idx*.

        Raises:
            ValueError: If the largest token id is not below ``len(tokenizer)``.
        """
        sample_text = self.texts[idx]
        target = int(self.labels[idx])
        encoded = self.tok(
            sample_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop the leading batch dimension added by return_tensors="pt".
        token_ids, mask = (
            encoded[key].squeeze(0).to(torch.long)
            for key in ("input_ids", "attention_mask")
        )

        # Reject ids that are not below len(tokenizer) before they reach the model.
        largest_id = int(token_ids.max().item())
        vocab_size = len(self.tok)
        if largest_id >= vocab_size:
            raise ValueError(f"Bad token id: max={largest_id} >= len(tokenizer)={vocab_size}")

        return {
            "input_ids": token_ids,
            "attention_mask": mask,
            "label": torch.tensor(target, dtype=torch.long),
        }

def make_loaders(tokenizer, max_length, batch_size):
    """Build DataLoaders for the module-level train/val/test DataFrames.

    Args:
        tokenizer: Tokenizer handed to each ``NewsDataset``.
        max_length: Sequence length handed to each ``NewsDataset``.
        batch_size: Batch size used by all three loaders.

    Returns:
        ``(train_loader, val_loader, test_loader)``; only the training loader
        shuffles.
    """
    def _loader(frame, shuffle):
        dataset = NewsDataset(frame["text_clean"].values, frame["label"].values, tokenizer, max_length)
        return DataLoader(dataset, batch_size=int(batch_size), shuffle=shuffle)

    train_loader = _loader(train_df, True)
    val_loader = _loader(val_df, False)
    test_loader = _loader(test_df, False)
    return train_loader, val_loader, test_loader

def compute_class_weights(y):
    """Compute inverse-frequency weights for the two classes.

    Each weight is ``total / (2 * count)``; a count of zero is treated as one
    so the division is always defined.

    Args:
        y: Array of non-negative integer labels.

    Returns:
        ``(counts, weights)`` where ``counts`` is the per-class count array and
        ``weights`` is a float32 tensor on ``CONFIG["device"]``.
    """
    counts = np.bincount(y, minlength=2)
    safe_counts = np.maximum(counts, 1)
    weights = counts.sum() / (2.0 * safe_counts)
    weight_tensor = torch.tensor(weights, dtype=torch.float32, device=CONFIG["device"])
    return counts, weight_tensor

def freeze_backbone(model, model_type: str, freeze_layers: int, freeze_embeddings: bool):
    """Disable gradients for the embeddings and/or the first encoder layers.

    The backbone is looked up as ``model.electra`` when *model_type* is
    ``"electra"``, otherwise as ``model.roberta`` and then ``model.bert``.
    Nothing happens when no such attribute exists, or when the backbone lacks
    ``embeddings`` / ``encoder.layer``.

    Args:
        model: Model whose parameters are modified in place.
        model_type: ``"electra"`` selects the ELECTRA backbone; any other value
            selects the RoBERTa/BERT lookup.
        freeze_layers: Number of leading encoder layers to freeze (<= 0: none).
        freeze_embeddings: Whether to freeze the embedding parameters.
    """
    if freeze_layers <= 0 and not freeze_embeddings:
        return

    # One lookup path for every architecture instead of two copies of the logic.
    candidates = ("electra",) if model_type == "electra" else ("roberta", "bert")
    base = next((getattr(model, attr) for attr in candidates if hasattr(model, attr)), None)
    if base is None:
        return

    def _freeze(module):
        # Turn off gradient tracking for every parameter of `module`.
        for param in module.parameters():
            param.requires_grad = False

    if freeze_embeddings and hasattr(base, "embeddings"):
        _freeze(base.embeddings)

    # Guard both `encoder` and `encoder.layer` for every model type.
    layers = getattr(getattr(base, "encoder", None), "layer", None)
    if layers is not None:
        for layer in list(layers)[:max(freeze_layers, 0)]:
            _freeze(layer)

@torch.no_grad()
def infer_probs(model, dataloader):
    """Run *model* over *dataloader* and collect P(FAKE) and confidence per sample.

    Args:
        model: Classifier whose output exposes ``.logits``; class 1 is FAKE.
        dataloader: Yields dicts with ``input_ids`` and ``attention_mask``.

    Returns:
        Two float32 arrays: the probability of class 1, and the confidence
        ``max(p, 1 - p)``.
    """
    device = CONFIG["device"]
    model.eval()

    fake_probs, confidences = [], []
    for batch in tqdm(dataloader, desc="Infer", leave=False):
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        p_fake = F.softmax(logits, dim=1)[:, 1].detach().cpu().numpy()
        fake_probs += p_fake.tolist()
        confidences += np.maximum(p_fake, 1.0 - p_fake).tolist()

    return (
        np.array(fake_probs, dtype=np.float32),
        np.array(confidences, dtype=np.float32),
    )

def train_model(model_name, model_type, epochs, max_length, batch_size, freeze_layers, freeze_embeddings):
    """Fine-tune one sequence classifier and return it with val/test probabilities.

    The model is trained on the module-level ``train_df`` with a class-weighted
    cross-entropy loss, evaluated on ``val_df`` after every epoch at a 0.5
    threshold, and the weights of the epoch with the best validation macro-F1
    are restored before the final inference passes.

    Args:
        model_name: Hugging Face checkpoint name.
        model_type: ``"electra"`` or another value; forwarded to ``freeze_backbone``.
        epochs: Number of training epochs.
        max_length: Tokenizer sequence length.
        batch_size: Batch size for all loaders.
        freeze_layers: Number of leading encoder layers to freeze.
        freeze_embeddings: Whether to freeze the embedding parameters.

    Returns:
        Dict with keys ``name``, ``tokenizer``, ``model``, ``p_val``, ``c_val``,
        ``p_test``, ``c_test`` and ``max_length``.
    """
    print("\n" + "="*90)
    print(f"🤖 TRAINING: {model_name}")
    print("="*90)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    train_loader, val_loader, test_loader = make_loaders(tokenizer, max_length, batch_size)

    # Dropout must be set at load time: the Dropout modules are created in the
    # model constructor, so editing model.config afterwards has no effect.
    config_overrides = {}
    if "electra" in model_name.lower():
        config_overrides = {
            "hidden_dropout_prob": float(CONFIG["dropout"]),
            "attention_probs_dropout_prob": float(CONFIG["dropout"]),
        }

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2, **config_overrides
    ).to(CONFIG["device"])
    try:
        model.resize_token_embeddings(len(tokenizer))
    except Exception as exc:
        # Still best effort, but no longer silent.
        print(f"⚠️ resize_token_embeddings skipped: {exc}")

    freeze_backbone(model, model_type, int(freeze_layers), bool(freeze_embeddings))

    y_train = train_df["label"].values.astype(int)
    counts, class_w = compute_class_weights(y_train)
    print("Class counts [REAL, FAKE]:", counts, "| class_weights:", class_w.detach().cpu().numpy())
    criterion = nn.CrossEntropyLoss(weight=class_w)

    # Only parameters left trainable by freeze_backbone are optimised.
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=float(CONFIG["learning_rate"]),
        weight_decay=float(CONFIG["weight_decay"]),
    )

    total_steps = max(len(train_loader) * int(epochs), 1)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        # Warmup can never exceed the whole schedule (tiny datasets / few epochs).
        num_warmup_steps=min(int(CONFIG["warmup_steps"]), total_steps),
        num_training_steps=total_steps
    )

    best_state = None
    best_f1 = -1

    for ep in range(int(epochs)):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader, desc=f"Train ep{ep+1}", leave=False):
            input_ids = batch["input_ids"].to(CONFIG["device"])
            attention_mask = batch["attention_mask"].to(CONFIG["device"])
            labels = batch["label"].to(CONFIG["device"])

            optimizer.zero_grad(set_to_none=True)
            out = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(out.logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += float(loss.item())

        # Validation after every epoch at a fixed 0.5 threshold on P(FAKE).
        p_val, _ = infer_probs(model, val_loader)
        pred_val = (p_val >= 0.5).astype(int)
        f1m = f1_score(val_df["label"].values.astype(int), pred_val, average="macro")
        acc = accuracy_score(val_df["label"].values.astype(int), pred_val)

        print(f"Epoch {ep+1}/{epochs} | loss={total_loss/max(len(train_loader),1):.4f} | val_acc={acc*100:.2f}% | val_macroF1={f1m:.4f}")

        # Strict ">" keeps the earliest epoch among equally good ones.
        if f1m > best_f1:
            best_f1 = f1m
            best_state = copy.deepcopy(model.state_dict())
            print("✅ New best")

    if best_state is not None:
        model.load_state_dict(best_state)

    p_val, c_val = infer_probs(model, val_loader)
    p_test, c_test = infer_probs(model, test_loader)

    return {
        "name": model_name,
        "tokenizer": tokenizer,
        "model": model,
        "p_val": p_val,
        "c_val": c_val,
        "p_test": p_test,
        "c_test": c_test,
        "max_length": int(max_length),
    }

# ============================================================
# TRAIN 2 MODELS
# ============================================================

# Fine-tune the ELECTRA checkpoint; the returned pack holds the tokenizer, the
# model and its validation/test probabilities.
electra_pack = train_model(
    model_name=CONFIG["electra_name"],
    model_type="electra",
    epochs=CONFIG["epochs_electra"],
    max_length=CONFIG["max_length_electra"],
    batch_size=CONFIG["batch_size_electra"],
    freeze_layers=CONFIG["freeze_electra_layers"],
    freeze_embeddings=CONFIG["freeze_electra_embeddings"],
)

# Fine-tune PhoBERT the same way; "roberta" selects the RoBERTa/BERT backbone
# lookup in freeze_backbone.
phobert_pack = train_model(
    model_name=CONFIG["phobert_name"],
    model_type="roberta",
    epochs=CONFIG["epochs_phobert"],
    max_length=CONFIG["max_length_phobert"],
    batch_size=CONFIG["batch_size_phobert"],
    freeze_layers=CONFIG["freeze_phobert_layers"],
    freeze_embeddings=CONFIG["freeze_phobert_embeddings"],
)

# ============================================================
# ENSEMBLE TUNING ON VAL
# ============================================================

print("\n" + "="*90)
print("🧪 TUNING ENSEMBLE (w, thr) ON VAL")
print("="*90)

pE, cE = electra_pack["p_val"], electra_pack["c_val"]
pP, cP = phobert_pack["p_val"], phobert_pack["c_val"]
y_val = val_df["label"].values.astype(int)

use_override = bool(CONFIG["use_override"])
override_conf = float(CONFIG["override_conf"])

# Override masks do not depend on the blend weight, so they are built once:
# `mask` marks samples where either model is confident enough, `choose_e`
# marks samples where ELECTRA is at least as confident as PhoBERT.
mask = np.maximum(cE, cP) >= override_conf
choose_e = (cE >= cP)

best = {"macro_f1": -1.0, "w": 0.5, "thr": 0.5}

# Grid search over (ELECTRA weight, threshold). The strict ">" keeps the first
# grid point reached among equally good ones.
for w in CONFIG["w_grid"]:
    p = w * pE + (1.0 - w) * pP

    if use_override:
        # Confident samples take the more confident model's probability as is.
        p = np.where(mask, np.where(choose_e, pE, pP), p)

    for thr in CONFIG["thr_grid"]:
        pred = (p >= thr).astype(int)
        f1m = f1_score(y_val, pred, average="macro")
        if f1m > best["macro_f1"]:
            best = {"macro_f1": float(f1m), "w": float(w), "thr": float(thr)}

print(f"✅ Best VAL Macro-F1: {best['macro_f1']:.4f} | w_electra={best['w']:.2f} | thr={best['thr']:.2f}")

# ============================================================
# EVAL ON TEST
# ============================================================

print("\n" + "="*90)
print("🎯 FINAL EVALUATION ON TEST (LEAK-SAFE)")
print("="*90)

pE_t, cE_t = electra_pack["p_test"], electra_pack["c_test"]
pP_t, cP_t = phobert_pack["p_test"], phobert_pack["c_test"]
y_test = test_df["label"].values.astype(int)

# Apply the weight and threshold selected on the validation split.
w, thr = best["w"], best["thr"]
p_test = w * pE_t + (1.0 - w) * pP_t

if use_override:
    # Same override rule as in tuning: a confident sample takes the probability
    # of whichever model is more confident.
    mask = np.maximum(cE_t, cP_t) >= override_conf
    choose_e = (cE_t >= cP_t)
    p_test = np.where(mask, np.where(choose_e, pE_t, pP_t), p_test)

pred_test = (p_test >= thr).astype(int)

acc = accuracy_score(y_test, pred_test)
f1_fake = f1_score(y_test, pred_test, pos_label=1, average="binary")
f1_real = f1_score(y_test, pred_test, pos_label=0, average="binary")
macro = (f1_fake + f1_real)/2

for _report_line in (
    f"Accuracy:    {acc*100:.2f}%",
    f"Macro-F1:    {macro:.4f}",
    f"F1 FAKE(1):  {f1_fake:.4f}",
    f"F1 REAL(0):  {f1_real:.4f}",
    f"Gap:         {abs(f1_fake - f1_real):.4f}",
):
    print(_report_line)

print("\n📋 Classification Report:")
print(classification_report(y_test, pred_test, target_names=["REAL (0)", "FAKE (1)"], digits=4))


📦 Installing packages...
🚀 VIETNAMESE FAKE NEWS - ENSEMBLE (LEAK-SAFE SPLIT)
🖥️ Device: cuda
🧾 Label mapping: 0=REAL, 1=FAKE

📂 LOADING DATA
🧹 Cleaning + exact dedup...
✅ After exact dedup: 1843 (removed 0)
✅ Final: 1843 samples | REAL=974 (52.8%) | FAKE=869 (47.2%)

🧩 BUILDING NEAR-DUP CLUSTERS
✅ Groups: 1806 | Largest group size: 8
Top 5 group sizes:
group
35     8
136    7
26     4
704    2
147    2

✂️ LEAK-SAFE SPLIT (STRATIFIED BY LABEL, GROUP-AWARE)
Picked fold 2 for TEST: size=185 (0.100)
Picked fold 2 for VAL: size=182 (0.110)
Train: 1476 | REAL=776 (52.6%) | FAKE=700 (47.4%) | groups=1444
Val  : 182 | REAL=98 (53.8%) | FAKE=84 (46.2%) | groups=180
Test : 185 | REAL=100 (54.1%) | FAKE=85 (45.9%) | groups=182

Group overlap Train∩Val=0 | Train∩Test=0 | Val∩Test=0

🧪 LEAK REPORT
Leak exact Train∩Val: 0
Leak exact Train∩Test: 0
Leak exact Val∩Test: 0

Near-dup cosine stats (char-ngram TFIDF):
Val->Train: {'mean': 0.3505195379257202, 'median': 0.30619990825653076, 'p95': 0.6880510

config.json: 0.00B [00:00, ?B/s]



vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

ElectraForSequenceClassification LOAD REPORT from: FPTAI/velectra-base-discriminator-cased
Key                                               | Status     | 
--------------------------------------------------+------------+-
discriminator_predictions.dense.weight            | UNEXPECTED | 
discriminator_predictions.dense_prediction.weight | UNEXPECTED | 
discriminator_predictions.dense_prediction.bias   | UNEXPECTED | 
discriminator_predictions.dense.bias              | UNEXPECTED | 
classifier.dense.bias                             | MISSING    | 
classifier.dense.weight                           | MISSING    | 
classifier.out_proj.bias                          | MISSING    | 
classifier.out_proj.weight                        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Class counts [REAL, FAKE]: [776 700] | class_weights: [0.9510309 1.0542858]


Train ep1:   0%|          | 0/93 [00:00<?, ?it/s]

Infer:   0%|          | 0/12 [00:00<?, ?it/s]

Epoch 1/5 | loss=0.5200 | val_acc=95.60% | val_macroF1=0.9555
✅ New best


Train ep2:   0%|          | 0/93 [00:00<?, ?it/s]

Infer:   0%|          | 0/12 [00:00<?, ?it/s]

Epoch 2/5 | loss=0.1304 | val_acc=93.96% | val_macroF1=0.9386


Train ep3:   0%|          | 0/93 [00:00<?, ?it/s]

Infer:   0%|          | 0/12 [00:00<?, ?it/s]

Epoch 3/5 | loss=0.0672 | val_acc=96.70% | val_macroF1=0.9667
✅ New best


Train ep4:   0%|          | 0/93 [00:00<?, ?it/s]

Infer:   0%|          | 0/12 [00:00<?, ?it/s]

Epoch 4/5 | loss=0.0438 | val_acc=96.15% | val_macroF1=0.9612


Train ep5:   0%|          | 0/93 [00:00<?, ?it/s]

Infer:   0%|          | 0/12 [00:00<?, ?it/s]

Epoch 5/5 | loss=0.0309 | val_acc=96.70% | val_macroF1=0.9667


Infer:   0%|          | 0/12 [00:00<?, ?it/s]

Infer:   0%|          | 0/12 [00:00<?, ?it/s]


🤖 TRAINING: vinai/phobert-base


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: vinai/phobert-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.pooler.dense.weight     | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.pooler.dense.bias       | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.decoder.bias            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.decoder.weight          | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initia

Class counts [REAL, FAKE]: [776 700] | class_weights: [0.9510309 1.0542858]


Train ep1:   0%|          | 0/185 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Infer:   0%|          | 0/23 [00:00<?, ?it/s]

Epoch 1/5 | loss=0.4404 | val_acc=97.25% | val_macroF1=0.9725
✅ New best


Train ep2:   0%|          | 0/185 [00:00<?, ?it/s]

Infer:   0%|          | 0/23 [00:00<?, ?it/s]

Epoch 2/5 | loss=0.1446 | val_acc=98.35% | val_macroF1=0.9834
✅ New best


Train ep3:   0%|          | 0/185 [00:00<?, ?it/s]

Infer:   0%|          | 0/23 [00:00<?, ?it/s]

Epoch 3/5 | loss=0.0748 | val_acc=95.60% | val_macroF1=0.9555


Train ep4:   0%|          | 0/185 [00:00<?, ?it/s]

Infer:   0%|          | 0/23 [00:00<?, ?it/s]

Epoch 4/5 | loss=0.0437 | val_acc=98.35% | val_macroF1=0.9834


Train ep5:   0%|          | 0/185 [00:00<?, ?it/s]

Infer:   0%|          | 0/23 [00:00<?, ?it/s]

Epoch 5/5 | loss=0.0352 | val_acc=98.35% | val_macroF1=0.9834


Infer:   0%|          | 0/23 [00:00<?, ?it/s]

Infer:   0%|          | 0/24 [00:00<?, ?it/s]


🧪 TUNING ENSEMBLE (w, thr) ON VAL
✅ Best VAL Macro-F1: 0.9779 | w_electra=0.05 | thr=0.20

🎯 FINAL EVALUATION ON TEST (LEAK-SAFE)
Accuracy:    97.84%
Macro-F1:    0.9782
F1 FAKE(1):  0.9762
F1 REAL(0):  0.9802
Gap:         0.0040

📋 Classification Report:
              precision    recall  f1-score   support

    REAL (0)     0.9706    0.9900    0.9802       100
    FAKE (1)     0.9880    0.9647    0.9762        85

    accuracy                         0.9784       185
   macro avg     0.9793    0.9774    0.9782       185
weighted avg     0.9786    0.9784    0.9784       185



In [None]:
import numpy as np
import torch
import torch.nn.functional as F

def predict_proba_one(pack, text: str):
    """Score a single text with one trained model.

    Args:
        pack: Dict with ``tokenizer``, ``model`` and ``max_length`` entries.
        text: Text to classify.

    Returns:
        ``(p_fake, p_real, conf)`` as Python floats, where ``conf`` is the
        larger of the two probabilities.
    """
    tokenizer = pack["tokenizer"]
    model = pack["model"]

    encoded = tokenizer(
        text,
        max_length=pack["max_length"],
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    inputs = {name: tensor.to(CONFIG["device"]) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
        # Row 0 of the softmax is [P(real), P(fake)] for the single input.
        row = F.softmax(logits, dim=1)[0].detach().cpu().numpy()

    p_real = float(row[0])
    p_fake = float(row[1])
    return p_fake, p_real, max(p_real, p_fake)

def ensemble_predict(text: str, use_override=False, override_conf=0.92):
    """Classify one text with the tuned ELECTRA + PhoBERT ensemble.

    The blend weight and decision threshold come from the module-level ``best``.

    Args:
        text: Text to classify.
        use_override: When true, and either model's confidence reaches
            *override_conf*, the more confident model's P(FAKE) replaces the
            weighted blend.
        override_conf: Confidence level that triggers the override.

    Returns:
        Dict with the final prediction (1 = FAKE), final P(FAKE), a confidence
        value, the threshold and weight used, and per-model details.
    """
    pE, rE, cE = predict_proba_one(electra_pack, text)
    pP, rP, cP = predict_proba_one(phobert_pack, text)

    w, thr = float(best["w"]), float(best["thr"])

    if use_override and max(cE, cP) >= override_conf:
        # Trust whichever model is more confident (ELECTRA wins ties).
        p = pE if cE >= cP else pP
    else:
        p = w * pE + (1 - w) * pP

    is_fake = p >= thr
    pred = 1 if is_fake else 0
    # Probability mass assigned to the predicted class.
    conf_final = p if is_fake else (1 - p)

    electra_info = {"p_fake": pE, "p_real": rE, "conf": cE}
    phobert_info = {"p_fake": pP, "p_real": rP, "conf": cP}
    return {
        "pred": pred,
        "p_fake_final": float(p),
        "conf_final": float(conf_final),
        "thr": thr,
        "w": w,
        "electra": electra_info,
        "phobert": phobert_info,
    }

def random_test_samples(df_split, k=15, seed=42, use_override=False):
    """Spot-check the ensemble on randomly drawn rows and print each result.

    Args:
        df_split: DataFrame providing a ``text_clean`` column (fed to the
            ensemble) and a ``label`` column (cast to int, shown as TRUE).
        k: Maximum number of rows to draw; capped at ``len(df_split)``.
        seed: Seed for ``np.random.default_rng`` so the draw is reproducible.
        use_override: Forwarded to ``ensemble_predict``.

    Returns:
        None. All results are reported via ``print``.
    """
    n_samples = min(k, len(df_split))
    if n_samples <= 0:
        # An empty split (or k <= 0) leaves nothing to draw; report it instead
        # of dividing by zero in the accuracy summary below.
        print("="*90)
        print("Random-check accuracy: no samples to evaluate")
        print("="*90)
        return

    rng = np.random.default_rng(seed)
    idxs = rng.choice(len(df_split), size=n_samples, replace=False)

    # Loop-invariant: read the override threshold once.
    override_conf = float(CONFIG.get("override_conf", 0.92))

    correct = 0
    for t, idx in enumerate(idxs, 1):
        row = df_split.iloc[int(idx)]
        text = str(row["text_clean"])
        y_true = int(row["label"])

        r = ensemble_predict(text, use_override=use_override, override_conf=override_conf)
        y_pred = int(r["pred"])

        ok = (y_pred == y_true)
        correct += int(ok)

        print("="*90)
        print(f"[{t}/{len(idxs)}] TRUE={y_true} | PRED={y_pred} | {'✅' if ok else '❌'}")
        print(f"FINAL: P(FAKE)={r['p_fake_final']:.3f} | Conf={r['conf_final']:.1%} | thr={r['thr']:.2f} | w_electra={r['w']:.2f}")
        print(f"Electra: P(FAKE)={r['electra']['p_fake']:.3f} | Conf={r['electra']['conf']:.3f}")
        print(f"PhoBERT: P(FAKE)={r['phobert']['p_fake']:.3f} | Conf={r['phobert']['conf']:.3f}")
        print("-"*90)
        print(text[:1200])  # truncate long texts to keep the output readable

    print("="*90)
    print(f"Random-check accuracy: {correct}/{len(idxs)} = {correct/len(idxs)*100:.1f}%")
    print("="*90)

# ✅ Random spot-check on rows drawn from the TEST split
random_test_samples(test_df, k=20, seed=2026, use_override=False)


[1/20] TRUE=0 | PRED=0 | ✅
FINAL: P(FAKE)=0.004 | Conf=99.6% | thr=0.20 | w_electra=0.05
Electra: P(FAKE)=0.002 | Conf=0.998
PhoBERT: P(FAKE)=0.005 | Conf=0.995
------------------------------------------------------------------------------------------
Nỗi lo khi hộ kinh doanh có doanh thu 3 tỷ đồng mới áp dụng hóa đơn điện tử. Chuyên gia cho rằng, hộ kinh doanh có doanh thu 1 tỷ đồng áp dụng hóa đơn điện tử là hợp lý, còn nếu nâng ngưỡng doanh thu lên 3 tỷ đồng mới áp dụng sẽ dẫn tới hệ quả tiêu cực.
[2/20] TRUE=0 | PRED=0 | ✅
FINAL: P(FAKE)=0.005 | Conf=99.5% | thr=0.20 | w_electra=0.05
Electra: P(FAKE)=0.002 | Conf=0.998
PhoBERT: P(FAKE)=0.005 | Conf=0.995
------------------------------------------------------------------------------------------
Diễn viên Vân Trang viên mãn với cuộc hôn nhân 10 năm bên chồng doanh nhân. 10 năm hôn nhân, Vân Trang có tổ ấm hạnh phúc bên chồng doanh nhân. Cả 2 có 4 người con, sống sung túc trong biệt thự, nhà vườn ở cả TPHCM và Tiền Giang.
[3/20] TRUE=

In [None]:
# ===== 10 OUT-OF-DATASET TESTS (5 REAL, 5 FAKE) =====
# Each entry is a (name, expected_label, text) tuple, with expected_label
# encoded as 0=REAL and 1=FAKE. The text is passed through clean_text()
# by run_generalization_test before prediction.
tests = [
    ("REAL-1", 0, "Bộ Công Thương cho biết trong quý I/2026, tổng mức bán lẻ hàng hóa và doanh thu dịch vụ tiêu dùng tăng so với cùng kỳ; cơ quan này đang phối hợp các địa phương theo dõi cung–cầu, bình ổn giá dịp lễ."),
    ("REAL-2", 0, "Ngân hàng Nhà nước thông báo tiếp tục điều hành chính sách tiền tệ theo mục tiêu kiểm soát lạm phát, ổn định tỷ giá; đồng thời yêu cầu các tổ chức tín dụng rà soát lãi suất cho vay với lĩnh vực ưu tiên."),
    ("REAL-3", 0, "Theo Trung tâm Dự báo Khí tượng Thủy văn Quốc gia, không khí lạnh tăng cường khiến miền Bắc chuyển rét; khu vực Trung Bộ có mưa rào rải rác, người dân cần đề phòng dông lốc và sạt lở ở vùng núi."),
    ("REAL-4", 0, "UBND TP.HCM công bố kế hoạch chỉnh trang một số tuyến đường nội đô, ưu tiên cải tạo hệ thống thoát nước và vỉa hè tại các điểm thường xuyên ngập sau mưa lớn."),
    ("REAL-5", 0, "Bộ Y tế khuyến cáo người dân tiêm nhắc lại vắc-xin theo hướng dẫn, không tự ý dùng thuốc kháng sinh khi có triệu chứng hô hấp; các cơ sở y tế tăng cường giám sát ca bệnh theo quy định."),

    ("FAKE-1", 1, "KHẨN CẤP!!! Chính phủ đã ra lệnh ‘đóng cửa toàn quốc’ từ ngày mai nhưng báo chí bị cấm đăng. Ai đọc được tin này hãy SHARE NGAY để cứu gia đình bạn!!!"),
    ("FAKE-2", 1, "Tin sốc: Chỉ cần uống nước chanh pha muối i-ốt mỗi sáng là ‘diệt sạch’ mọi virus trong cơ thể, bác sĩ nước ngoài xác nhận 100% hiệu quả — không cần tiêm vắc-xin nữa!"),
    ("FAKE-3", 1, "Bí mật bị che giấu: Một thiết bị phát sóng lạ đang được lắp trên cột điện để ‘điều khiển não bộ’. Nhiều người đã mất ngủ và đau đầu sau 3 ngày — tuyệt đối không ra đường!"),
    ("FAKE-4", 1, "Thủ tướng vừa ký quyết định ‘xóa toàn bộ nợ ngân hàng’ cho mọi công dân Việt Nam. Ai có nợ chỉ cần nhắn CCCD + số tài khoản vào inbox để được duyệt ngay hôm nay!"),
    ("FAKE-5", 1, "WHO thừa nhận vắc-xin gây vô sinh hàng loạt và yêu cầu các nước dừng tiêm ngay lập tức. Thông tin này đã bị xóa khỏi internet, chỉ còn bản chụp màn hình, xem nhanh kẻo mất!"),
]

def run_generalization_test(tests, use_override=False):
    """Run the ensemble on hand-picked samples and print a per-sample report.

    Args:
        tests: Sequence of ``(name, y_true, text)`` tuples, where ``y_true``
            is 0 for REAL and 1 for FAKE. Each text is passed through
            ``clean_text`` before prediction.
        use_override: Forwarded to ``ensemble_predict``.

    Returns:
        None. All results are reported via ``print``.
    """
    n_tests = len(tests)
    print("="*110)
    # Report the actual sample count rather than a hard-coded number.
    print(f"GENERALIZATION TEST ({n_tests} samples) | expected: 0=REAL, 1=FAKE")
    print("="*110)

    if n_tests == 0:
        # Avoid dividing by zero in the summary when no samples were given.
        print("RESULT: no test samples provided")
        print("="*110)
        return

    # Loop-invariant: read the override threshold once.
    override_conf = float(CONFIG.get("override_conf", 0.92))

    correct = 0
    for name, y_true, text in tests:
        t = clean_text(text)
        r = ensemble_predict(t, use_override=use_override, override_conf=override_conf)

        y_pred = int(r["pred"])
        ok = (y_pred == y_true)
        correct += int(ok)

        print(f"\n{name} | TRUE={y_true} | PRED={y_pred} | {'✅' if ok else '❌'}")
        print(f"FINAL: P(FAKE)={r['p_fake_final']:.3f} | Conf={r['conf_final']:.1%} | thr={r['thr']:.2f} | w_electra={r['w']:.2f}")
        print(f"Electra: P(FAKE)={r['electra']['p_fake']:.3f} | Conf={r['electra']['conf']:.3f}")
        print(f"PhoBERT: P(FAKE)={r['phobert']['p_fake']:.3f} | Conf={r['phobert']['conf']:.3f}")
        print("-"*110)

    print("\n" + "="*110)
    print(f"RESULT: {correct}/{n_tests} = {correct/n_tests*100:.1f}%")
    print("="*110)

# Run the generalization test on the samples defined above:
run_generalization_test(tests, use_override=False)


GENERALIZATION TEST (10 samples) | expected: 0=REAL, 1=FAKE

REAL-1 | TRUE=0 | PRED=0 | ✅
FINAL: P(FAKE)=0.006 | Conf=99.4% | thr=0.20 | w_electra=0.05
Electra: P(FAKE)=0.004 | Conf=0.996
PhoBERT: P(FAKE)=0.007 | Conf=0.993
--------------------------------------------------------------------------------------------------------------

REAL-2 | TRUE=0 | PRED=1 | ❌
FINAL: P(FAKE)=0.407 | Conf=40.7% | thr=0.20 | w_electra=0.05
Electra: P(FAKE)=0.005 | Conf=0.995
PhoBERT: P(FAKE)=0.428 | Conf=0.572
--------------------------------------------------------------------------------------------------------------

REAL-3 | TRUE=0 | PRED=0 | ✅
FINAL: P(FAKE)=0.165 | Conf=83.5% | thr=0.20 | w_electra=0.05
Electra: P(FAKE)=0.007 | Conf=0.993
PhoBERT: P(FAKE)=0.173 | Conf=0.827
--------------------------------------------------------------------------------------------------------------

REAL-4 | TRUE=0 | PRED=0 | ✅
FINAL: P(FAKE)=0.011 | Conf=98.9% | thr=0.20 | w_electra=0.05
Electra: P(FAKE)=0.056 