# Vietnamese Hallucination Detection (Cross-Encoder Reranker → 3-class Classifier)

This notebook fine-tunes **`AITeamVN/Vietnamese_Reranker`** (cross-encoder) to classify
each (context, response) pair into **`no`**, **`intrinsic`**, **`extrinsic`**.

Expected CSV columns:
- `context`: source/reference passages concatenated (string)
- `response`: model answer / hypothesis to check (string)
- `label`: one of `no`, `intrinsic`, `extrinsic` (train). Test may omit or leave empty.

If your dataset also has a `prompt` column, it is appended to the response to form the hypothesis by default (`USE_PROMPT_IF_AVAILABLE = True` in step 5); set that flag to `False` to use the response alone.


In [1]:
# 0) Setup & config
import os, warnings, random
import numpy as np
import pandas as pd
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# File paths (edit if needed); both are read with pandas in step 1.
TRAIN_PATH = "/kaggle/input/llmhallucination/vihallu-train.csv"
TEST_PATH  = "/kaggle/input/hallucination-test/vihallu-public-test.csv"  # point this at a different test CSV if needed

# Model and training hyperparams (Cross-Encoder)
MODEL_NAME  = "AITeamVN/Vietnamese_Reranker"   # checkpoint loaded for both tokenizer and model
MAX_LENGTH  = 384                                # tokenizer truncation length; 256/384/512 depending on memory
LR          = 5e-6                               # a bit lower than encoder-only baseline
EPOCHS      = 3
BATCH_SIZE  = 8                                  # per-device train/eval batch size; cross-encoder → heavier, reduce if OOM
WEIGHTED_LOSS = True                             # use class weights if imbalance
SEED        = 42                                 # used for the split, TrainingArguments and the RNGs below
VAL_SPLIT   = 0.1                                # fraction of the training rows held out for validation

# Seed Python's and NumPy's global RNGs (torch is not seeded in this cell).
random.seed(SEED)
np.random.seed(SEED)
print(TRAIN_PATH, TEST_PATH, MODEL_NAME)

/kaggle/input/llmhallucination/vihallu-train.csv /kaggle/input/hallucination-test/vihallu-public-test.csv AITeamVN/Vietnamese_Reranker


In [2]:
# 1) Load data
# Train must provide context/response/label. Test only needs the columns that
# are actually read at inference time (id, context, response); a label column
# is optional there. Missing cells are replaced with "" in both frames.
train_df = pd.read_csv(TRAIN_PATH).fillna("")
test_df  = pd.read_csv(TEST_PATH).fillna("")
assert {"context","response","label"}.issubset(train_df.columns), "train.csv must have context,response,label"
# Fail fast here rather than after training: `id` is needed to build the submission.
assert {"id","context","response"}.issubset(test_df.columns), "test.csv must have id,context,response (label columns are optional)"
print("Train size:", len(train_df), "Test size:", len(test_df))
display(train_df.head(2))

Train size: 7000 Test size: 1000


Unnamed: 0,id,context,prompt,response,label
0,9b1ea51d-d1ff-45ba-8cf1-6a91328e8600,"Vào những năm 1870, hai nhà điêu khắc Augustus...","Vào những năm 1960, nơi nào trở thành trung tâ...",Quảng trường Washington là trung tâm của thế h...,extrinsic
1,db7a89c6-2a6a-42af-beef-58e557ecc819,Cách mạng Tháng Mười đã biến một cuộc chiến tr...,Cách mạng Tháng Mười đã khởi xướng chủ nghĩa p...,Sai. Cách mạng Tháng Mười đã đánh bại chủ nghĩ...,no


In [3]:
# 2) Encode labels
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels (expected values: no, intrinsic, extrinsic)
# and keep the integer targets for the stratified split.
le = LabelEncoder()
y = le.fit_transform(train_df["label"])
num_labels = len(le.classes_)
print("Classes:", list(le.classes_), "->", num_labels)

# Persist the class names, one per line in encoder order, for later inference.
classes_txt = "/kaggle/working/label_classes.txt"
with open(classes_txt, "w", encoding="utf-8") as f:
    f.writelines(str(c).strip() + "\n" for c in le.classes_)
print("Saved:", classes_txt)

Classes: ['extrinsic', 'intrinsic', 'no'] -> 3
Saved: /kaggle/working/label_classes.txt


In [4]:
# 3) Train/Val split (stratified)
from sklearn.model_selection import train_test_split

if VAL_SPLIT <= 0 or len(train_df) <= 50:
    # Too little data (or validation disabled): train on everything, keep an
    # empty validation frame with the same columns.
    tr_df = train_df.copy()
    val_df = train_df.iloc[:0].copy()
else:
    # Hold out VAL_SPLIT of the rows, preserving the label distribution.
    tr_df, val_df = train_test_split(
        train_df,
        test_size=VAL_SPLIT,
        stratify=y,
        random_state=SEED,
    )
print("Train rows:", len(tr_df), "Val rows:", len(val_df))

Train rows: 6300 Val rows: 700


In [30]:
# 4) HuggingFace setup (robust head replacement for reranker → 3-class)
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Tokenizer from the same checkpoint as the model (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load the checkpoint as-is and tolerate a head-size mismatch; the
# classification head is replaced further down in this cell.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    ignore_mismatched_sizes=True,   # avoid a loading error (per the original note, the checkpoint head has out=1)
)
model.to(device)

def _fresh_linear_like(old_linear, num_labels):
    """Build a new Linear(old_linear.in_features -> num_labels) on old_linear's device/dtype.

    Weights are drawn from N(0, 0.02) and the bias is zeroed.
    """
    new_linear = nn.Linear(old_linear.in_features, num_labels)
    nn.init.normal_(new_linear.weight, std=0.02)
    nn.init.zeros_(new_linear.bias)
    # Keep the replacement where the old head lived so the swap is also correct
    # for models that were already moved to a device or cast to another dtype.
    return new_linear.to(device=old_linear.weight.device, dtype=old_linear.weight.dtype)


def _swap_linear_at(model, dotted_name, num_labels):
    """Replace the nn.Linear located at `dotted_name` (e.g. "classifier.out_proj").

    Returns True if a Linear was found there and swapped, False otherwise.
    """
    *parent_path, leaf = dotted_name.split(".")
    parent = model
    for part in parent_path:
        parent = getattr(parent, part, None)
        if parent is None:
            return False
    old_linear = getattr(parent, leaf, None)
    if not isinstance(old_linear, nn.Linear):
        return False
    setattr(parent, leaf, _fresh_linear_like(old_linear, num_labels))
    return True


def _replace_final_linear_head(model, num_labels: int):
    """Replace the final Linear head used to produce logits with out_features=num_labels.

    Tries, in order, the attribute paths `classifier.out_proj`, `classifier`
    and `score`; if none of them holds an nn.Linear, falls back to the last
    nn.Linear in `model.named_modules()` provided it has a single output.

    Args:
        model: an nn.Module whose classification head should be resized.
        num_labels: number of output classes for the new head.

    Returns:
        True if a head was replaced, False (after printing a warning) otherwise.
    """
    # 1-3) well-known head locations, most specific first
    for dotted_name in ("classifier.out_proj", "classifier", "score"):
        try:
            swapped = _swap_linear_at(model, dotted_name, num_labels)
        except Exception as exc:
            # Best effort: report the problem and try the next candidate
            # instead of silently swallowing it.
            print(f"WARNING: failed to replace head at {dotted_name}: {exc!r}")
            continue
        if swapped:
            print(f"Replaced head at {dotted_name} ->", num_labels)
            return True

    # 4) last resort: the last nn.Linear in the module tree, if it outputs 1 value
    last_name, last_linear = None, None
    for name, mod in model.named_modules():
        if isinstance(mod, nn.Linear):
            last_name, last_linear = name, mod
    # An empty name means the model itself is the Linear; that cannot be
    # swapped in place, so it is treated as "not found".
    if last_name and last_linear.out_features == 1 and _swap_linear_at(model, last_name, num_labels):
        print(f"Replaced last nn.Linear '{last_name}' ->", num_labels)
        return True

    print("WARNING: Could not replace head automatically; model may still output 1-dim logits.")
    return False

# Actually replace to our desired number of labels.
# Seed torch first: the only seeding done before this point covers `random`
# and NumPy, so without this the new head's random init differs between runs.
torch.manual_seed(SEED)
_replaced = _replace_final_linear_head(model, num_labels)
if not _replaced:
    # Fail fast with a clear message instead of continuing towards a shape
    # error in the loss once training starts.
    raise RuntimeError(
        f"Could not replace the classification head of {MODEL_NAME!r}; "
        f"the model would not produce {num_labels}-dim logits."
    )
model.config.num_labels = num_labels
model.config.problem_type = "single_label_classification"
# Keep a module-level `num_labels` (if the architecture caches one) in sync
# with the config so the model's built-in loss, if ever used, sees the new size.
if hasattr(model, "num_labels"):
    model.num_labels = num_labels
model.to(device)

Device: cuda
Replaced head at classifier.out_proj -> 3


In [31]:
# 5) Dataset wrappers (pair encoding: hypothesis=response, evidence=context)
from datasets import Dataset

USE_PROMPT_IF_AVAILABLE = True  # set False to ignore prompt even if present

def build_pair(df):
    """Return (hypotheses, evidences) as two parallel lists of strings.

    The hypothesis is the response, with the prompt appended after a
    "[Prompt]:" marker when USE_PROMPT_IF_AVAILABLE is set and the frame has a
    `prompt` column. The evidence is the context.
    """
    responses = df["response"].astype(str)
    include_prompt = USE_PROMPT_IF_AVAILABLE and "prompt" in df.columns
    if not include_prompt:
        hypotheses = responses.tolist()
    else:
        prompts = df["prompt"].astype(str)
        hypotheses = (responses + " \n\n[Prompt]: " + prompts).tolist()
    evidences = df["context"].astype(str).tolist()
    return hypotheses, evidences

def preprocess_function(examples):
    """Tokenize a batch of (hypothesis, evidence) pairs without padding.

    Pairs are truncated to MAX_LENGTH; padding is left to the data collator.
    """
    hypotheses = examples["hypothesis"]
    evidences = examples["evidence"]
    return tokenizer(
        hypotheses,
        evidences,
        padding=False,
        truncation=True,
        max_length=MAX_LENGTH,
    )

def to_hf(df: pd.DataFrame, with_labels=True):
    """Turn a dataframe into a tokenized `datasets.Dataset`.

    When `with_labels` is true, the `label` column is encoded with the fitted
    LabelEncoder and kept as a `labels` column; the two raw text columns are
    dropped after tokenization.
    """
    hypotheses, evidences = build_pair(df)
    columns = {"hypothesis": hypotheses, "evidence": evidences}
    if with_labels:
        columns["labels"] = le.transform(df["label"]).tolist()

    # Only the text columns are removed, so "labels" survives the map.
    text_columns = ["hypothesis", "evidence"]
    raw_dataset = Dataset.from_dict(columns)
    return raw_dataset.map(preprocess_function, batched=True, remove_columns=text_columns)


In [32]:
# 6) Build datasets
ds_train = to_hf(tr_df, with_labels=True)

# The validation set only exists when the split produced rows.
ds_val = None
if len(val_df) > 0:
    ds_val = to_hf(val_df, with_labels=True)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-F1 from model predictions.

    Accepts either an object exposing `.predictions` / `.label_ids` or a plain
    `(predictions, labels)` pair. If the predictions are a tuple/list, the
    first element is used.
    """
    try:
        scores, gold = eval_pred.predictions, eval_pred.label_ids
    except AttributeError:
        scores, gold = eval_pred

    if isinstance(scores, (tuple, list)):
        scores = scores[0]

    predicted = np.argmax(scores, axis=-1)

    from sklearn.metrics import f1_score, accuracy_score
    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["macro_f1"] = f1_score(gold, predicted, average="macro")
    return results

Map:   0%|          | 0/6300 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

In [43]:
# 7) Class weights (kept as before)
# Weight for class i is total / (num_labels * count_i), so rarer classes get
# larger weights; max(count, 1) guards against a class with no training rows.
from collections import Counter
class_weights = None  # stays None when weighting is disabled or there are no training rows
if WEIGHTED_LOSS and len(tr_df) > 0:
    # Count training rows per encoded class id.
    cnt = Counter(le.transform(tr_df["label"]))
    total = sum(cnt.values())
    class_weights = torch.tensor(
        [total / (num_labels * max(cnt[i], 1)) for i in range(num_labels)],
        dtype=torch.float,
        device=device
    )
    print("Class weights:", class_weights.cpu().numpy())

from transformers import Trainer
import torch.nn as nn

DEBUG_EVERY = 10  # print the raw batch loss every 10 steps; 0 disables the debug print

class WeightedTrainer(Trainer):
    """Trainer that computes (optionally class-weighted) cross-entropy itself.

    Labels are never forwarded to the model, so the model's built-in loss is
    not used. The weights come from the notebook-level `class_weights` tensor
    (None means unweighted).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Return the cross-entropy loss for one batch (and the model outputs if requested).

        Raises:
            KeyError: if the batch has no "labels" entry.
            RuntimeError: if no logits can be extracted from the model output.
        """
        # Split labels from the model inputs without mutating the caller's dict.
        labels = inputs["labels"]
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)

        # Extract logits defensively: attribute `logits`, then `scores`, then first tuple item.
        logits = getattr(outputs, "logits", None)
        if logits is None:
            logits = getattr(outputs, "scores", None)
        if logits is None and isinstance(outputs, (tuple, list)) and len(outputs) > 0:
            logits = outputs[0]
        if logits is None:
            raise RuntimeError("Model output has neither 'logits' nor 'scores'.")

        # Read the class count from the logits rather than `model.config`:
        # `model` may be a wrapper (e.g. DataParallel) that does not expose `.config`.
        weight = class_weights.to(logits.device) if class_weights is not None else None
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Debug trace confirming this custom loss is the one being used.
        step = self.state.global_step
        if DEBUG_EVERY and step % DEBUG_EVERY == 0:
            print(f"[dbg step {step}] loss={float(loss.detach().cpu()):.4f}")

        return (loss, outputs) if return_outputs else loss





Class weights: [1.0115607 0.9532456 1.0390896]


In [20]:
import os

# Disable Weights & Biases so a stale W&B session cannot hang the run.
os.environ.update({"WANDB_MODE": "disabled"})


In [44]:
# 8) Training arguments
output_dir = "/kaggle/working/vi_reranker_hallu_cls"
args = TrainingArguments(
    output_dir=output_dir,
    seed=SEED,
    # optimisation
    learning_rate=LR,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # logging / checkpoints
    logging_steps=10,   # log once every 10 steps
    save_total_limit=2,
    # evaluation is only enabled when a non-empty validation set exists
    do_eval=(ds_val is not None and len(ds_val) > 0),
)
print(
    "max_steps:", getattr(args, "max_steps", None),
    "| logging_steps:", args.logging_steps,
)

print(args)

max_steps: -1 | logging_steps: 10
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.N

In [28]:
# Sanity check on the tokenized training set.
print(ds_train.column_names)  # must contain: input_ids, attention_mask, (token_type_ids?), labels
assert "labels" in ds_train.column_names


['labels', 'input_ids', 'attention_mask']


In [35]:
# Sanity check: one forward/backward pass on a single padded batch.
# The batch is built here so the cell also works on a fresh kernel instead of
# relying on a `batch` variable left over from an earlier interactive session.
from transformers import DataCollatorWithPadding

_sanity_rows = [ds_train[i] for i in range(min(BATCH_SIZE, len(ds_train)))]
batch = DataCollatorWithPadding(tokenizer)(_sanity_rows)
batch = {k: v.to(device) for k, v in batch.items()}

# forward: pass model inputs ONLY (no labels), so the model does not compute its own loss
feed = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]}
if "token_type_ids" in batch:
    feed["token_type_ids"] = batch["token_type_ids"]

out = model(**feed)

# extract logits/scores defensively
logits = getattr(out, "logits", None)
if logits is None:
    logits = getattr(out, "scores", None)
if logits is None and isinstance(out, (tuple, list)) and len(out) > 0:
    logits = out[0]

print("Sanity shapes:",
      {k: (tuple(v.shape) if hasattr(v, "shape") else type(v)) for k, v in batch.items()})
print("Logits shape:", None if logits is None else tuple(logits.shape))

# compute the CE loss ourselves (do not use out.loss)
loss = nn.CrossEntropyLoss()(logits.view(-1, model.config.num_labels), batch["labels"].view(-1))
loss.backward()
print("Sanity loss/backward OK:", float(loss.detach().cpu()))

# Discard the gradients produced by this check so they cannot leak into real training.
model.zero_grad()


Sanity shapes: {'labels': (8,), 'input_ids': (8, 384), 'attention_mask': (8, 384)}
Logits shape: (8, 3)
Sanity loss/backward OK: 1.1176613569259644


In [49]:
# 8b) Reset Accelerate/Trainer state before the real training run
import os, gc, torch
os.environ["WANDB_MODE"] = "disabled"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# Reset Accelerate's shared state if the package is available.
# Best effort: `_reset_state` is a private helper, so any failure (missing
# package, missing method) is deliberately ignored here.
try:
    from accelerate.state import AcceleratorState
    AcceleratorState._reset_state()
except Exception:
    pass

# Run the garbage collector and release cached CUDA memory before training.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(">> Reset done. CUDA:", torch.cuda.is_available())


>> Reset done. CUDA: True


In [50]:
# 9) Train
# First Trainer instance, built without an explicit data collator. Training is
# not started here; the next cell only inspects its dataloader.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    # evaluation dataset and metrics are only wired in when a non-empty validation set exists
    eval_dataset=ds_val if (ds_val and len(ds_val)>0) else None,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics if (ds_val and len(ds_val)>0) else None,
)
# trainer.train() is intentionally not called in this cell.
#trainer.train()

In [51]:
# Inspect the training dataloader: number of batches and the keys of the first batch.
dl = trainer.get_train_dataloader()
print("len(train_dataloader) =", len(dl))
first = next(iter(dl), None)  # None if the dataloader yields nothing
print("sample keys:", None if first is None else list(first.keys()))


len(train_dataloader) = 788
sample keys: ['labels', 'input_ids', 'attention_mask']


In [53]:
# 9) Train (gọn – 1 lệnh train duy nhất, log đều)
import math
from transformers import DataCollatorWithPadding, TrainerCallback

# Pad each batch dynamically, rounding the padded length up to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# Make sure logs go to stdout only and are not sent to any reporting integration.
# The value is set after construction, so use the already-normalised form
# (an empty list) rather than the "none" string shortcut.
# Overrides are best effort (the arguments object may refuse mutation in some
# library versions), but failures are reported instead of silently swallowed.
try:
    args.report_to = []
except Exception as exc:
    print(f"WARNING: could not disable reporting integrations: {exc!r}")
try:
    args.logging_steps = 10
    args.logging_first_step = True
    args.disable_tqdm = False
except Exception as exc:
    print(f"WARNING: could not override logging settings: {exc!r}")

# Create a NEW Trainer after the reset; this is the instance used for training,
# evaluation and test prediction below.
trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    # evaluation dataset and metrics are only wired in when a non-empty validation set exists
    eval_dataset=ds_val if (ds_val is not None and len(ds_val) > 0) else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if (ds_val is not None and len(ds_val) > 0) else None,
)

# Print the logged values every time the Trainer emits a log entry
class PrinterCallback(TrainerCallback):
    """Callback that echoes each log record to stdout, minus `total_*` entries."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the current step and the filtered log dict; skip empty records."""
        if not logs:
            return
        visible = {}
        for key, value in logs.items():
            if key.startswith("total_"):
                continue
            visible[key] = value
        print(f"[step {state.global_step}] {visible}")

trainer.add_callback(PrinterCallback())

# Rough estimate only: dataset size divided by the per-device batch size.
steps_per_epoch = math.ceil(len(ds_train) / args.per_device_train_batch_size)
print("Steps/epoch (est.):", steps_per_epoch, "| Batch size:", args.per_device_train_batch_size)

print(">>> START TRAIN")
train_output = trainer.train()    # trains for the full num_train_epochs = EPOCHS
print(">>> END TRAIN | global_step:", trainer.state.global_step)

# Evaluate once on the validation set, if there is one.
if ds_val is not None and len(ds_val) > 0:
    metrics = trainer.evaluate()
    print("Validation:", metrics)


Steps/epoch (est.): 788 | Batch size: 8
>>> START TRAIN
[dbg step 0] loss=1.1756


Step,Training Loss
1,1.1756
10,1.1926
20,1.1306
30,1.1326
40,1.0655
50,1.1191
60,1.0954
70,1.1176
80,1.1237
90,1.1097


[step 1] {'loss': 1.1756, 'grad_norm': 10.214020729064941, 'learning_rate': 5e-06, 'epoch': 0.0012690355329949238}
[step 10] {'loss': 1.1926, 'grad_norm': 8.094978332519531, 'learning_rate': 4.9809644670050765e-06, 'epoch': 0.012690355329949238}
[dbg step 10] loss=1.0709
[step 20] {'loss': 1.1306, 'grad_norm': 10.42998218536377, 'learning_rate': 4.959813874788494e-06, 'epoch': 0.025380710659898477}
[dbg step 20] loss=1.0357
[step 30] {'loss': 1.1326, 'grad_norm': 12.48512077331543, 'learning_rate': 4.938663282571912e-06, 'epoch': 0.03807106598984772}
[dbg step 30] loss=1.0238
[step 40] {'loss': 1.0655, 'grad_norm': 12.00502872467041, 'learning_rate': 4.91751269035533e-06, 'epoch': 0.050761421319796954}
[dbg step 40] loss=1.0561
[step 50] {'loss': 1.1191, 'grad_norm': 7.028442859649658, 'learning_rate': 4.896362098138748e-06, 'epoch': 0.06345177664974619}
[dbg step 50] loss=1.1245
[step 60] {'loss': 1.0954, 'grad_norm': 8.039970397949219, 'learning_rate': 4.875211505922166e-06, 'epoch':

[step 2364] {'eval_loss': 0.7241652011871338, 'eval_accuracy': 0.73, 'eval_macro_f1': 0.7325335167267438, 'eval_runtime': 25.2385, 'eval_samples_per_second': 27.735, 'eval_steps_per_second': 3.487, 'epoch': 3.0}
Validation: {'eval_loss': 0.7241652011871338, 'eval_accuracy': 0.73, 'eval_macro_f1': 0.7325335167267438, 'eval_runtime': 25.2385, 'eval_samples_per_second': 27.735, 'eval_steps_per_second': 3.487, 'epoch': 3.0}


In [54]:
# 10b) Evaluate on validation (if a validation set exists)
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

if ds_val is not None and len(ds_val) > 0:
    val_pred = trainer.predict(ds_val)
    # Predicted class id = index of the largest logit per row.
    val_preds = np.argmax(val_pred.predictions, axis=-1)

    print("\n[VALIDATION]")
    print("Val accuracy:", accuracy_score(val_pred.label_ids, val_preds))
    print("Val macro-F1:", f1_score(val_pred.label_ids, val_preds, average="macro"))
    # Class names are taken from the fitted LabelEncoder.
    print(classification_report(val_pred.label_ids, val_preds, target_names=le.classes_))

    # (optional) confusion matrix; first argument is the gold labels, second the predictions
    cm = confusion_matrix(val_pred.label_ids, val_preds)
    print("Confusion matrix:\n", cm)



[VALIDATION]
Val accuracy: 0.73
Val macro-F1: 0.7325335167267438
              precision    recall  f1-score   support

   extrinsic       0.78      0.74      0.76       231
   intrinsic       0.65      0.68      0.67       245
          no       0.77      0.78      0.77       224

    accuracy                           0.73       700
   macro avg       0.73      0.73      0.73       700
weighted avg       0.73      0.73      0.73       700

Confusion matrix:
 [[170  48  13]
 [ 39 167  39]
 [  8  42 174]]


In [55]:
# 11) Predict on the test set and create submit.csv (submission format)
ALLOWED = {"no", "intrinsic", "extrinsic"}

def to_hf_test(df: pd.DataFrame):
    """Tokenize an unlabeled dataframe for inference.

    Delegates to `to_hf(..., with_labels=False)` so that train, validation and
    test inputs are always built by the same pairing/tokenization code.
    """
    return to_hf(df, with_labels=False)

# 1) Predict: tokenize the test frame, take the argmax class per row and map
#    the class ids back to their string labels.
ds_test  = to_hf_test(test_df)
test_out = trainer.predict(ds_test)
test_pred = np.argmax(test_out.predictions, axis=-1)
test_labels = le.inverse_transform(test_pred)

# 2) Build submit.csv: only two columns, id + predict_label
assert "id" in test_df.columns, "Test CSV phải có cột 'id'!"
submit = pd.DataFrame({
    "id": test_df["id"].astype(str),
    "predict_label": test_labels.astype(str),
})

# 3) Check that every predicted value is one of the allowed labels
bad = set(submit["predict_label"].unique()) - ALLOWED
assert len(bad) == 0, f"Giá trị predict_label không hợp lệ: {bad}"

# 4) Save the CSV and compress it into a zip archive
out_csv = "/kaggle/working/submit.csv"
submit.to_csv(out_csv, index=False)
print("Wrote:", out_csv, "-> head:\n", submit.head())

import zipfile
out_zip = "/kaggle/working/submit.zip"
with zipfile.ZipFile(out_zip, "w", zipfile.ZIP_DEFLATED) as z:
    # Store the file under a flat name inside the archive.
    z.write(out_csv, arcname="submit.csv")
print("Zipped:", out_zip)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Wrote: /kaggle/working/submit.csv -> head:
                                      id predict_label
0  b709059b-b3b6-4ac2-bb88-2c794e2cc219     extrinsic
1  7dc35ef5-c4b7-4538-ab90-627b9cbd896e     extrinsic
2  cfdfa010-f61c-4845-91c9-23f79be2b88b     extrinsic
3  31b33c97-2f59-4e72-8707-f47de204d7f9     extrinsic
4  a2c83a00-e8b7-4236-86ce-5e0104df074a     intrinsic
Zipped: /kaggle/working/submit.zip


### Notes
- This treats `(response [+ prompt], context)` as a cross-encoder input.
- For stronger *intrinsic vs. extrinsic* separation, add an NLI head/pipeline after training (optional).
- If you hit CUDA OOM, reduce `BATCH_SIZE` or `MAX_LENGTH`.
