In [1]:
import pandas as pd

def process_csv(input_path, output_path):
    # Đọc file CSV
    df = pd.read_csv(input_path)

    # 1. Xóa cột index thừa nếu có
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # 2. Xóa dòng trống trong context, prompt, response
    before = len(df)
    df = df.dropna(subset=["context", "prompt", "response"])
    after = len(df)
    removed = before - after
    if removed > 0:
        print(f"⚠️ Đã xóa {removed} dòng bị trống trong context, prompt hoặc response")

    # 3. Thêm cột premise = prompt + [SEP] + context
    df["premise"] = df["prompt"].astype(str) + " [SEP] " + df["context"].astype(str)

    # 4. Thêm cột hypothesis = response
    df["hypothesis"] = df["response"].astype(str)

    # Lưu lại file mới
    df.to_csv(output_path, index=False)
    print(f"✅ Đã xử lý và lưu file tại: {output_path}")

    return df

In [2]:
processed_df = process_csv("/kaggle/input/nli-vihallucination/train.csv", "train_processed.csv")


⚠️ Đã xóa 10 dòng bị trống trong context, prompt hoặc response
✅ Đã xử lý và lưu file tại: train_processed.csv


In [3]:
# =========================================================================================
# === BƯỚC 3 (CHIẾN LƯỢC SOTA): HUẤN LUYỆN CROSS-ENCODER NÂNG CAO ==========================
# =========================================================================================
print("Installing necessary libraries...")
!pip install -q transformers scikit-learn pandas torch tqdm

import os
import gc
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.swa_utils import AveragedModel, SWALR

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_cosine_schedule_with_warmup,
    DebertaV2ForSequenceClassification,
)

# =========================================================================================
# === 1. CẤU HÌNH VÀ SIÊU THAM SỐ =========================================================
# =========================================================================================
class Config:
    TRAIN_PATH = "/kaggle/working/train_processed.csv"
    TEST_PATH = "/kaggle/input/nli-vihallu/NLI-ViHallu/test.csv"
    WORKDIR = "/kaggle/working/"
    
    MODEL_NAME = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    SEED = 42; MAX_LEN = 512;
    
    TEST_SIZE = 0.1
    EPOCHS = 15
    PATIENCE = 3
    
    USE_SWA = True
    USE_FGM = True
    USE_FOCAL_LOSS = True
    MULTI_SAMPLE_DROPOUT = True
    
    PER_DEVICE_BATCH_SIZE = 2; GRADIENT_ACCUMULATION_STEPS = 8; FP16 = False 
    LR_HEAD = 5e-5; LR_LLRD = [5e-6, 2.5e-6, 1e-6] 
    WARMUP_RATIO = 0.1; WEIGHT_DECAY = 0.01; GRAD_CLIP = 1.0

def seed_everything(seed):
    random.seed(seed); os.environ['PYTHONHASHSEED'] = str(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True; torch.backends.cudnn.benchmark = False

seed_everything(Config.SEED)

# =========================================================================================
# === 2. CÁC LỚP VÀ HÀM NÂNG CAO ==========================================================
# =========================================================================================
class FGM():
    def __init__(self, model): self.model = model; self.backup = {}
    def attack(self, epsilon=1., emb_name='deberta.embeddings.word_embeddings.weight'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm): param.data.add_(epsilon * param.grad / norm)
    def restore(self, emb_name='deberta.embeddings.word_embeddings.weight'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup: param.data = self.backup[name]
        self.backup = {}

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, weight=None): super().__init__(); self.alpha=alpha; self.gamma=gamma; self.weight=weight
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.weight)
        pt = torch.exp(-ce_loss); loss = self.alpha * (1 - pt)**self.gamma * ce_loss
        return loss.mean()

class HallucinationNLIModel(DebertaV2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
    
    def forward(self, *args, **kwargs):
        if self.training or not Config.MULTI_SAMPLE_DROPOUT:
            return super().forward(*args, **kwargs)
        
        labels = kwargs.pop("labels", None)
        self.classifier.train()
        
        # <<< FIX: Gọi thẳng hàm forward của lớp cha một cách tường minh >>>
        logits_list = []
        for _ in range(5):
            outputs = DebertaV2ForSequenceClassification.forward(self, *args, **kwargs)
            logits_list.append(outputs.logits)
        
        logits = torch.mean(torch.stack(logits_list, dim=0), dim=0)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        from transformers.modeling_outputs import SequenceClassifierOutput
        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=None, attentions=None)

# =========================================================================================
# === 3. TẢI VÀ CHUẨN BỊ DỮ LIỆU ==========================================================
# =========================================================================================
print(f"Loading pre-processed NLI data...")
train_df = pd.read_csv(Config.TRAIN_PATH); test_df = pd.read_csv(Config.TEST_PATH)
label_mapping = {'no': 0, 'extrinsic': 1, 'intrinsic': 2}
train_df['nli_label'] = train_df['label'].str.lower().map(label_mapping)

train_part_df, val_part_df = train_test_split(train_df, test_size=Config.TEST_SIZE, stratify=train_df['nli_label'], random_state=Config.SEED)
print(f"Data split. Train: {len(train_part_df)}, Val: {len(val_part_df)}")
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)

class NLIDataset(Dataset):
    def __init__(self, df, is_test=False):
        self.premises = df['premise'].astype(str).tolist()
        self.hypotheses = df['hypothesis'].astype(str).tolist()
        self.is_test = is_test
        if not self.is_test: self.labels = df['nli_label'].tolist()
    def __len__(self): return len(self.premises)
    def __getitem__(self, idx):
        item = tokenizer(self.premises[idx], self.hypotheses[idx], padding='max_length', truncation=True, max_length=Config.MAX_LEN, return_tensors='pt')
        item = {k: v.squeeze(0) for k, v in item.items()}
        if not self.is_test: item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_loader = DataLoader(NLIDataset(train_part_df), batch_size=Config.PER_DEVICE_BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(NLIDataset(val_part_df), batch_size=Config.PER_DEVICE_BATCH_SIZE * 2, shuffle=False, num_workers=0)
test_loader = DataLoader(NLIDataset(test_df, is_test=True), batch_size=Config.PER_DEVICE_BATCH_SIZE * 2, shuffle=False, num_workers=0)

# =========================================================================================
# === 4. KHỞI TẠO MÔ HÌNH VÀ OPTIMIZER NÂNG CAO ==========================================
# =========================================================================================
model = HallucinationNLIModel.from_pretrained(Config.MODEL_NAME).to(Config.DEVICE)

def get_llrd_optimizer(model, lr_head, lr_llrd, weight_decay):
    params = list(model.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in params if "classifier" in n or "pooler" in n], "lr": lr_head, "weight_decay": 0.0}]
    num_layers = model.config.num_hidden_layers
    layers = [model.deberta.encoder.layer[i] for i in range(num_layers)]
    for i in range(num_layers):
        lr = lr_llrd[0] if i > num_layers - 5 else (lr_llrd[1] if i > 4 else lr_llrd[2])
        optimizer_grouped_parameters.append({"params": layers[i].parameters(), "lr": lr, "weight_decay": weight_decay})
    optimizer_grouped_parameters.append({"params": model.deberta.embeddings.parameters(), "lr": lr_llrd[2], "weight_decay": weight_decay})
    return torch.optim.AdamW(optimizer_grouped_parameters)

optimizer = get_llrd_optimizer(model, Config.LR_HEAD, Config.LR_LLRD, Config.WEIGHT_DECAY)
num_training_steps = len(train_loader) * Config.EPOCHS // Config.GRADIENT_ACCUMULATION_STEPS
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(num_training_steps*Config.WARMUP_RATIO), num_training_steps=num_training_steps)

class_weights = compute_class_weight('balanced', classes=np.unique(train_part_df['nli_label']), y=train_part_df['nli_label'])
loss_fn = FocalLoss(weight=torch.tensor(class_weights, dtype=torch.float).to(Config.DEVICE))
print("Using Focal Loss with weights:", class_weights)

# =========================================================================================
# === 5. VÒNG LẶP HUẤN LUYỆN NÂNG CAO ======================================================
# =========================================================================================
def evaluate_loop(model, loader):
    model.eval(); all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(Config.DEVICE) for k, v in batch.items() if k != 'token_type_ids'}
            labels = batch.pop("labels")
            logits = model(**batch, labels=labels).logits
            all_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return f1_score(all_labels, all_preds, average='macro')

scaler = torch.amp.GradScaler(enabled=Config.FP16); autocast = torch.amp.autocast(device_type=Config.DEVICE.type, enabled=Config.FP16)
fgm = FGM(model) if Config.USE_FGM else None; best_val_f1 = 0.0; patience_counter = 0; save_path = os.path.join(Config.WORKDIR, "best_nli_sota_model.pth")
swa_model = AveragedModel(model) if Config.USE_SWA else None
swa_scheduler = SWALR(optimizer, swa_lr=1e-5) if Config.USE_SWA else None

print("\n--- Starting SOTA NLI Model Training ---")
for epoch in range(Config.EPOCHS):
    model.train(); running_loss = 0.0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{Config.EPOCHS}")
    for step, batch in enumerate(pbar):
        batch = {k: v.to(Config.DEVICE) for k, v in batch.items() if k != 'token_type_ids'}
        labels = batch.pop("labels")
        with autocast:
            logits = model(**batch).logits
        loss = loss_fn(logits, labels) / Config.GRADIENT_ACCUMULATION_STEPS
        scaler.scale(loss).backward()
        if Config.USE_FGM:
            fgm.attack()
            with autocast: logits_adv = model(**batch).logits
            loss_adv = loss_fn(logits_adv, labels) / Config.GRADIENT_ACCUMULATION_STEPS
            scaler.scale(loss_adv).backward(); fgm.restore()

        if (step + 1) % Config.GRADIENT_ACCUMULATION_STEPS == 0:
            scaler.unscale_(optimizer); torch.nn.utils.clip_grad_norm_(model.parameters(), Config.GRAD_CLIP)
            scaler.step(optimizer); scaler.update(); optimizer.zero_grad()
            if Config.USE_SWA and epoch >= Config.EPOCHS - 3: swa_scheduler.step()
            else: scheduler.step()
        running_loss += loss.item() * Config.GRADIENT_ACCUMULATION_STEPS
        pbar.set_postfix({'loss': running_loss / (step + 1)})
    
    if Config.USE_SWA and epoch >= Config.EPOCHS - 3:
        swa_model.update_parameters(model)

    val_f1 = evaluate_loop(swa_model if Config.USE_SWA and epoch >= Config.EPOCHS - 3 else model, val_loader)
    print(f"Epoch {epoch+1} summary | Val F1: {val_f1:.4f}")
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1; patience_counter = 0
        model_to_save = swa_model.module if Config.USE_SWA and epoch >= Config.EPOCHS - 3 else model
        torch.save(model_to_save.state_dict(), save_path)
        print(f"  -> New best model saved with F1: {best_val_f1:.4f}")
    else:
        patience_counter += 1; print(f"  -> No improvement. Patience: {patience_counter}/{Config.PATIENCE}")
    if patience_counter >= Config.PATIENCE: print("--- Early stopping triggered ---"); break
    gc.collect(); torch.cuda.empty_cache()

# =========================================================================================
# === 6. DỰ ĐOÁN VÀ SUBMISSION ============================================================
# =========================================================================================
print("\nLoading best model for inference..."); 
model_to_load = HallucinationNLIModel.from_pretrained(Config.MODEL_NAME).to(Config.DEVICE)
model_to_load.load_state_dict(torch.load(save_path))
model_to_load.eval()

test_preds = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting on test set"):
        batch = {k: v.to(Config.DEVICE) for k, v in batch.items() if k != 'token_type_ids'}
        logits = model_to_load(**batch).logits
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())

nli_to_original_map = {0: 'no', 1: 'extrinsic', 2: 'intrinsic'}
final_labels = [nli_to_original_map[p] for p in test_preds]
submission_df = pd.DataFrame({'id': test_df['id'], 'predict_label': final_labels})
submission_df.to_csv('submission_nli_sota.csv', index=False)
print("\n--- Submission file 'submission_nli_sota.csv' created successfully! ---"); print(submission_df.head())



Installing necessary libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m107.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m

2025-09-29 07:44:20.290177: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759131860.452981      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759131860.506629      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading pre-processed NLI data...
Data split. Train: 6291, Val: 699


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Using Focal Loss with weights: [1.03966287 1.01108968 0.95318182]

--- Starting SOTA NLI Model Training ---


Epoch 1/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 1 summary | Val F1: 0.6795
  -> New best model saved with F1: 0.6795


Epoch 2/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 2 summary | Val F1: 0.7582
  -> New best model saved with F1: 0.7582


Epoch 3/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 3 summary | Val F1: 0.7621
  -> New best model saved with F1: 0.7621


Epoch 4/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 4 summary | Val F1: 0.7757
  -> New best model saved with F1: 0.7757


Epoch 5/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 5 summary | Val F1: 0.7796
  -> New best model saved with F1: 0.7796


Epoch 6/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 6 summary | Val F1: 0.7791
  -> No improvement. Patience: 1/3


Epoch 7/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 7 summary | Val F1: 0.7844
  -> New best model saved with F1: 0.7844


Epoch 8/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 8 summary | Val F1: 0.7910
  -> New best model saved with F1: 0.7910


Epoch 9/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 9 summary | Val F1: 0.7834
  -> No improvement. Patience: 1/3


Epoch 10/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 10 summary | Val F1: 0.7885
  -> No improvement. Patience: 2/3


Epoch 11/15:   0%|          | 0/3146 [00:00<?, ?it/s]

Epoch 11 summary | Val F1: 0.7864
  -> No improvement. Patience: 3/3
--- Early stopping triggered ---

Loading best model for inference...


Predicting on test set:   0%|          | 0/250 [00:00<?, ?it/s]


--- Submission file 'submission_nli_sota.csv' created successfully! ---
                                     id predict_label
0  b709059b-b3b6-4ac2-bb88-2c794e2cc219     extrinsic
1  7dc35ef5-c4b7-4538-ab90-627b9cbd896e     extrinsic
2  cfdfa010-f61c-4845-91c9-23f79be2b88b     extrinsic
3  31b33c97-2f59-4e72-8707-f47de204d7f9     intrinsic
4  a2c83a00-e8b7-4236-86ce-5e0104df074a     intrinsic
