In [None]:
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from torch.cuda.amp import autocast, GradScaler
import matplotlib.pyplot as plt
import seaborn as sns
import gc # cleaning memory
import time
from datetime import timedelta
from tqdm.notebook import tqdm # tracking training process
import warnings
import copy
# Silence every Python warning for cleaner notebook output
# (note: this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Explicitly keep autograd anomaly detection switched off.
torch.autograd.set_detect_anomaly(False)

# NOTE: from here on the name `logging` refers to transformers' logging utilities,
# not the standard-library `logging` module.
from transformers import logging
# Only let the transformers library report errors.
logging.set_verbosity_error()

# Project folders (all located under a single base directory).
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/data-science-challenge-competition"
DATA_DIR, MODEL_DIR, RESULT_DIR = (
    os.path.join(BASE_DIR, sub_dir) for sub_dir in ("data", "model", "result")
)

# Make sure the output folders exist before anything is written into them.
for _out_dir in (MODEL_DIR, RESULT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Output artefact locations.
best_path = os.path.join(MODEL_DIR, "best_overall_model.pt")
submit_path = os.path.join(RESULT_DIR, "submit.csv")

In [None]:
# Report whether a CUDA device is visible. The device name is only queried when
# one exists, because torch.cuda.get_device_name(0) raises on CPU-only runtimes
# (the rest of the notebook falls back to CPU in that case).
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected - running on CPU.")

In [None]:
# 2. Load Data (JSONL)
def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. an extra newline at the end of the file) are skipped
        # instead of crashing json.loads with "Expecting value".
        return [json.loads(line) for line in f if line.strip()]

# Text columns that must not contain NaN before they reach the tokenizer.
_TEXT_FILL = {"context": "", "prompt": "", "response": ""}

# Raw records of each split.
train_records = read_jsonl(os.path.join(DATA_DIR, "processed/vihallu-train-split.jsonl"))
val_records = read_jsonl(os.path.join(DATA_DIR, "processed/vihallu-val-split.jsonl"))
test_records = read_jsonl(os.path.join(DATA_DIR, "jsonl/vihallu-public-test.jsonl"))

# One DataFrame per split, with missing text replaced by empty strings.
train_df = pd.DataFrame(train_records).fillna(_TEXT_FILL)
val_df = pd.DataFrame(val_records).fillna(_TEXT_FILL)
df_test = pd.DataFrame(test_records).fillna(_TEXT_FILL)

# Train and validation are merged: K-Fold cross validation re-splits them later.
full_train_df = pd.concat([train_df, val_df], ignore_index=True).fillna(_TEXT_FILL)
df_test = df_test.fillna(_TEXT_FILL)

# --- Label mapping: sorted label names <-> consecutive integer ids ---
labels = sorted(full_train_df['label'].unique().tolist())
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

# Replace the label names by their integer ids.
full_train_df = full_train_df.assign(label=full_train_df['label'].map(label2id))

print(f"Total training data size for K-Fold: {len(full_train_df)}")
print(f"Test data size: {len(df_test)}")

In [None]:
# Prefer the GPU when one is visible, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def clean_memory():
    """Run Python's garbage collector, then release PyTorch's cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Configuration and Hyperparameters

class CFG:
    """Central container for every tunable setting of the experiment.

    The attributes are read as class attributes (``CFG.xxx``); the class is
    passed around as ``cfg`` to the model and the training function.
    """

    # How the NLI and coverage latent vectors are combined in the model.
    fusion_type = "concat"  # options: "concat", "bilinear", "gated"
    freeze_backbone = False  # set to True to try training only the heads
    
    # Model & Tokenizer
    model_name = "uitnlp/CafeBERT"  # checkpoint name given to AutoTokenizer / AutoModel
    max_length = 512  # every example is truncated/padded to this many tokens

    # Training
    batch_size = 24  # training batch size (evaluation loaders use a multiple of it)
    gradient_accumulation_steps = 2  # batches accumulated per optimizer step
    num_train_epochs = 15  # upper bound of epochs per fold
    fp16 = True  # enables autocast and the gradient scaler
    early_stopping_patience = 3  # epochs without Macro-F1 improvement before a fold stops
    max_grad_norm = 1.0  # gradient-norm clipping threshold

    # Optimizer
    learning_rate = 2e-5
    weight_decay = 0.05

    # Loss Weights: total loss = classifier + alpha * NLI + beta * coverage
    alpha = 0.7
    beta = 0.3

    # Model Architecture
    latent_dim = 384  # width of the latent vectors produced by each head
    dropout_rate = 0.3
    bilstm_hidden_size = 384  # per direction; the BiLSTM output is twice this size
    bilstm_num_layers = 1
    transformer_num_heads = 8
    transformer_num_layers = 2

    # K-Fold
    n_splits = 5
    seed = 42  # random_state of the stratified split

# --- Initialize Tokenizer ---
# The tokenizer is loaded once and shared by every dataset in the notebook.
# use_fast=False requests the slow (pure Python) tokenizer implementation.
# NOTE(review): trust_remote_code=True permits running custom code shipped with the
# model repository - confirm the source is trusted.
tokenizer = AutoTokenizer.from_pretrained(
    CFG.model_name,
    use_fast=False,
    trust_remote_code=True
)

In [None]:
# Dataset Class (With Pseudo-label)

class HallucinationDataset(Dataset):
    """Torch dataset that tokenizes (context, prompt + response) pairs.

    When the frame has a ``label`` column, each item additionally carries the
    gold label and two pseudo targets derived from it (a 3-way NLI class and a
    0/1 coverage score) for the auxiliary heads.
    """

    # Tensor fields copied from the tokenizer output into every item.
    _ENCODED_FIELDS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Columns are cached as arrays for fast positional access.
        self.contexts = df['context'].values
        self.prompts = df['prompt'].values
        self.responses = df['response'].values
        # Unlabelled frames (e.g. the test set) simply have no targets.
        self.labels = df['label'].values if 'label' in df.columns else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        first_segment = str(self.contexts[idx])
        second_segment = str(self.prompts[idx]) + " " + str(self.responses[idx])

        # Segment 0 = context, segment 1 = prompt + response.
        encoding = self.tokenizer(
            first_segment,
            second_segment,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt',
            return_token_type_ids=True
        )

        # Drop the leading batch dimension added by return_tensors='pt'.
        item = {field: encoding[field].flatten() for field in self._ENCODED_FIELDS}

        if self.labels is None:
            return item

        gold = self.labels[idx]
        item['real_label'] = torch.tensor(gold, dtype=torch.long)

        # NLI pseudo target: 'no' -> 0, 'intrinsic' -> 1, anything else -> 2.
        nli_target = 0 if gold == label2id['no'] else (1 if gold == label2id['intrinsic'] else 2)
        # Coverage pseudo target: 0.0 for 'extrinsic', 1.0 otherwise.
        coverage_target = 0.0 if gold == label2id['extrinsic'] else 1.0

        item['pseudo_label_nli'] = torch.tensor(nli_target, dtype=torch.long)
        item['pseudo_label_coverage'] = torch.tensor(coverage_target, dtype=torch.float)
        return item

In [None]:
# Multi-Task Hallucination  Model

class HallucinationModel(nn.Module):
    """Multi-task hallucination classifier on top of a pretrained encoder.

    Three outputs are produced from one backbone pass:

    * an NLI head (cross-attention features -> BiLSTM -> Transformer encoder
      -> pooled latent vector -> 3-way logits),
    * a coverage head (forward/backward alignment scores -> latent vector
      -> scalar score),
    * the final classifier over the fused latent vectors.

    ``cfg.fusion_type`` selects the fusion and therefore the input width of the
    final classifier:

    * ``"bilinear"``: ``[nli, coverage, bilinear(nli, coverage)]`` -> ``3 * latent_dim``
    * ``"gated"``: ``g * nli + (1 - g) * coverage`` -> ``latent_dim``
    * anything else (``"concat"``): ``[nli, coverage]`` -> ``2 * latent_dim``

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        cfg: Configuration object providing ``fusion_type``, ``freeze_backbone``,
            ``bilstm_hidden_size``, ``bilstm_num_layers``, ``transformer_num_heads``,
            ``transformer_num_layers``, ``latent_dim`` and ``dropout_rate``.
    """

    def __init__(self, model_name, cfg):
        super(HallucinationModel, self).__init__()
        self.cfg = cfg
        # Frozen at construction time so the layers built below and the branch
        # taken in forward() can never disagree.
        self.fusion_type = cfg.fusion_type
        self.backbone = AutoModel.from_pretrained(model_name)
        bert_hidden_size = self.backbone.config.hidden_size

        if cfg.freeze_backbone:
            for param in self.backbone.parameters():
                param.requires_grad = False

        # Input is 4x the hidden size: [H, H~, H - H~, H * H~] (see forward).
        self.bilstm = nn.LSTM(
            bert_hidden_size * 4,
            cfg.bilstm_hidden_size,
            cfg.bilstm_num_layers,
            bidirectional=True,
            batch_first=True
        )

        transformer_layer = nn.TransformerEncoderLayer(
            d_model=cfg.bilstm_hidden_size * 2,  # d_model must match the BiLSTM output width
            nhead=cfg.transformer_num_heads,
            dim_feedforward=cfg.bilstm_hidden_size * 2 * 4,
            dropout=cfg.dropout_rate,
            activation='relu',
            batch_first=True
        )

        self.nli_transformer_encoder = nn.TransformerEncoder(
            transformer_layer,
            num_layers=cfg.transformer_num_layers
        )

        # Takes the concatenation of average- and max-pooled encoder outputs.
        self.mlp_nli = nn.Sequential(
            nn.Linear(cfg.bilstm_hidden_size * 2 * 2, cfg.latent_dim),
            nn.ReLU(),
            nn.Dropout(cfg.dropout_rate)
        )
        self.nli_classifier = nn.Linear(cfg.latent_dim, 3)

        # Takes the two scalar alignment scores (forward, backward).
        self.mlp_coverage = nn.Sequential(
            nn.Linear(2, cfg.latent_dim),
            nn.ReLU(),
            nn.Dropout(cfg.dropout_rate)
        )
        self.coverage_regressor = nn.Linear(cfg.latent_dim, 1)

        self.mlp_norm_nli = nn.LayerNorm(cfg.latent_dim)
        self.mlp_norm_coverage = nn.LayerNorm(cfg.latent_dim)

        # Fusion layers: only the layers required by the selected fusion are
        # created, and the classifier input width is derived from that choice.
        if self.fusion_type == "bilinear":
            self.bilinear = nn.Bilinear(cfg.latent_dim, cfg.latent_dim, cfg.latent_dim)
            fused_dim = cfg.latent_dim * 3
        elif self.fusion_type == "gated":
            self.gate = nn.Linear(cfg.latent_dim * 2, cfg.latent_dim)
            fused_dim = cfg.latent_dim
        else:  # concat
            fused_dim = cfg.latent_dim * 2

        self.final_classifier = nn.Sequential(
            nn.Linear(fused_dim, cfg.latent_dim),
            nn.ReLU(),
            nn.Dropout(cfg.dropout_rate),
            nn.Linear(cfg.latent_dim, len(label2id))
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Run the backbone and all three heads.

        Args:
            input_ids: Token ids, shape (batch_size, seq_len).
            attention_mask: 1 for real tokens, 0 for padding; same shape.
            token_type_ids: 0 for the context segment, 1 for the
                prompt + response segment; same shape.

        Returns:
            dict with ``final_logits`` (batch_size, number of labels),
            ``logits_nli`` (batch_size, 3) and ``predicted_score_coverage``
            (batch_size,).
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True
        )
        H_all_tokens = outputs.hidden_states[-1]

        # --- Split the sequence into context and response parts ---
        # context_mask is True at real (non-padding) context positions.
        context_mask = (token_type_ids == 0) & (attention_mask == 1)
        # response_mask is True at real (non-padding) response positions.
        response_mask = (token_type_ids == 1) & (attention_mask == 1)

        # Expand to (batch_size, seq_len, 1) so the masks broadcast over the hidden dim.
        context_mask_expanded = context_mask.unsqueeze(-1).float()
        response_mask_expanded = response_mask.unsqueeze(-1).float()

        # Multiplication zeroes out the tokens that belong to the other part.
        H_context = H_all_tokens * context_mask_expanded
        H_response = H_all_tokens * response_mask_expanded

        # NLI Head
        # NOTE(review): the softmax below is not masked, so zeroed (non-response)
        # positions still receive attention weight; kept as in the original design.
        e = torch.matmul(H_context, H_response.transpose(1, 2))
        tilde_H_context = torch.matmul(torch.softmax(e, dim=2), H_response)
        M_context = torch.cat([H_context, tilde_H_context, H_context - tilde_H_context, H_context * tilde_H_context], dim=-1)

        # pack_padded_sequence needs CPU lengths of at least 1.
        context_lengths = context_mask.sum(dim=1).cpu()
        context_lengths = torch.clamp(context_lengths, min=1)
        packed_M_context = pack_padded_sequence(M_context, context_lengths, batch_first=True, enforce_sorted=False)
        packed_V_context, _ = self.bilstm(packed_M_context)
        # Pad back to the actual sequence length of this batch (not a fixed
        # cfg.max_length) so the masks below always line up with V_context.
        V_context, _ = pad_packed_sequence(packed_V_context, batch_first=True, total_length=input_ids.size(1))

        src_key_padding_mask = (context_mask == 0)
        V_prime_context = self.nli_transformer_encoder(V_context, src_key_padding_mask=src_key_padding_mask)

        # Pooling
        sum_pooled = torch.sum(V_prime_context, 1)
        count_pooled = context_mask.sum(1, keepdim=True)
        avg_pool = torch.where(count_pooled > 0, sum_pooled / (count_pooled + 1e-9), torch.zeros_like(sum_pooled))
        # Non-context positions are pushed to a large negative value before the max.
        V_prime_context_masked = V_prime_context.masked_fill(context_mask.unsqueeze(-1).logical_not(), -1e4)
        max_pool, _ = torch.max(V_prime_context_masked, 1)

        latent_nli_vector = self.mlp_nli(torch.cat((avg_pool, max_pool), 1))
        latent_nli_vector = self.mlp_norm_nli(latent_nli_vector)
        logits_nli = self.nli_classifier(latent_nli_vector)

        # Coverage head
        safe_mask_value = -1e4  # finite, so it stays representable under fp16 autocast

        context_token_count = context_mask.sum(1, keepdim=True)
        response_token_count = response_mask.sum(1, keepdim=True)

        # Forward score: best response match per context row, averaged over context tokens.
        e_masked_forward = e.masked_fill(response_mask.unsqueeze(1).logical_not(), safe_mask_value)
        align_scores_forward, _ = torch.max(e_masked_forward, dim=2)
        sum_scores_forward = torch.sum(align_scores_forward, dim=1, keepdim=True)
        forward_score = torch.where(context_token_count > 0, sum_scores_forward / (context_token_count + 1e-9), torch.zeros_like(sum_scores_forward))

        # Backward score: best context match per response column, averaged over response tokens.
        e_masked_backward = e.masked_fill(context_mask.unsqueeze(2).logical_not(), safe_mask_value)
        align_scores_backward, _ = torch.max(e_masked_backward, dim=1)
        sum_scores_backward = torch.sum(align_scores_backward, dim=1, keepdim=True)
        backward_score = torch.where(response_token_count > 0, sum_scores_backward / (response_token_count + 1e-9), torch.zeros_like(sum_scores_backward))

        latent_coverage_vector = self.mlp_coverage(torch.cat([forward_score, backward_score], dim=1))
        latent_coverage_vector = self.mlp_norm_coverage(latent_coverage_vector)
        predicted_score_coverage = self.coverage_regressor(latent_coverage_vector)

        # Fusion (the branches mirror the layer construction in __init__)
        if self.fusion_type == "bilinear":
            fusion_part = self.bilinear(latent_nli_vector, latent_coverage_vector)
            fused_vector = torch.cat([latent_nli_vector, latent_coverage_vector, fusion_part], dim=1)
        elif self.fusion_type == "gated":
            gate_val = torch.sigmoid(self.gate(torch.cat([latent_nli_vector, latent_coverage_vector], dim=1)))
            fused_vector = gate_val * latent_nli_vector + (1 - gate_val) * latent_coverage_vector
        else:  # concat
            fused_vector = torch.cat([latent_nli_vector, latent_coverage_vector], dim=1)

        final_logits = self.final_classifier(fused_vector)

        return {"final_logits": final_logits, "logits_nli": logits_nli, "predicted_score_coverage": predicted_score_coverage.squeeze(-1)}

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, scaler, device, cfg, loss_fns):
    """Train the model for one epoch with mixed precision and gradient accumulation.

    Args:
        model: Module returning a dict with ``final_logits``, ``logits_nli`` and
            ``predicted_score_coverage``.
        dataloader: Yields batches with ``input_ids``, ``attention_mask``,
            ``token_type_ids``, ``real_label``, ``pseudo_label_nli`` and
            ``pseudo_label_coverage``.
        optimizer: Optimizer updated once per accumulation group.
        scheduler: Learning-rate scheduler stepped after each applied optimizer step.
        scaler: Gradient scaler used for the (optional) fp16 training.
        device: Device the batch tensors are moved to.
        cfg: Provides ``fp16``, ``alpha``, ``beta``, ``max_grad_norm`` and
            ``gradient_accumulation_steps``.
        loss_fns: Dict with the ``classifier``, ``nli`` and ``coverage`` losses.

    Returns:
        float: Mean of the per-batch losses over the epoch.
    """
    model.train()
    total_loss = 0.0
    num_batches = len(dataloader)
    progress_bar = tqdm(dataloader, desc="Training", leave=False)

    # Start from clean gradients so nothing leaks in from a previous epoch.
    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        real_labels = batch['real_label'].to(device)
        pseudo_nli_labels = batch['pseudo_label_nli'].to(device)
        pseudo_cov_labels = batch['pseudo_label_coverage'].to(device)

        with autocast(enabled=cfg.fp16):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            loss_classifier = loss_fns['classifier'](outputs['final_logits'], real_labels)
            # The auxiliary losses are computed on fp32 copies of the head outputs.
            logits_nli_fp32 = outputs['logits_nli'].float()
            pred_score_cov_fp32 = outputs['predicted_score_coverage'].float()
            loss_nli = loss_fns['nli'](logits_nli_fp32, pseudo_nli_labels)
            loss_coverage = loss_fns['coverage'](pred_score_cov_fp32, pseudo_cov_labels)
            loss = loss_classifier + cfg.alpha * loss_nli + cfg.beta * loss_coverage

        # Only the backward pass uses the loss divided by the accumulation factor.
        scaler.scale(loss / cfg.gradient_accumulation_steps).backward()

        # Update after every full accumulation group AND on the last batch, so a
        # trailing partial group is applied instead of leaking into the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % cfg.gradient_accumulation_steps == 0 or is_last_batch:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            # The scaler lowers its scale when it skipped the optimizer step
            # (inf/NaN gradients); the schedule only advances after a real update.
            if scaler.get_scale() >= scale_before:
                scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': total_loss / (step + 1)})

    # total_loss is a sum of per-batch means, so it is averaged over the number
    # of batches (not the number of samples).
    return total_loss / max(num_batches, 1)

# --- Evaluation helper ---
def eval_model(model, dataloader, device):
    """Evaluate the final classifier on a labelled dataloader.

    Args:
        model: Module returning a dict that contains ``final_logits``.
        dataloader: Yields batches with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``real_label``.
        device: Device the input tensors are moved to.

    Returns:
        tuple: ``(macro_f1, confusion_matrix, report_str, actuals, predictions)``
        where ``report_str`` is the text classification report and the last two
        items are the per-sample gold and predicted label ids.
    """
    model.eval()
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            preds = torch.argmax(outputs['final_logits'], dim=1)
            predictions.extend(preds.cpu().numpy())
            actuals.extend(batch['real_label'].cpu().numpy())

    # Metrics
    label_ids = list(id2label.keys())
    macro_f1 = f1_score(actuals, predictions, average='macro', zero_division=0)
    cm = confusion_matrix(actuals, predictions, labels=label_ids)

    # Text report. `labels` is passed explicitly so `target_names` always lines
    # up with the label ids, even when a class is missing from this split
    # (otherwise sklearn raises on the length mismatch).
    report_str = classification_report(
        actuals,
        predictions,
        labels=label_ids,
        target_names=labels,
        zero_division=0,
        digits=4  # four decimals for a more detailed report
    )

    return macro_f1, cm, report_str, actuals, predictions

In [None]:
# State of the best model seen across all folds.
best_model_state = None
best_overall_f1 = -1
best_val_idx = None  # validation indices of the fold that produced the best model

skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)

# Main K-Fold loop
for fold, (train_idx, val_idx) in enumerate(skf.split(full_train_df, full_train_df['label'])):
    print(f"==================== FOLD {fold+1}/{CFG.n_splits} ====================")
    clean_memory()  # free VRAM before a new fold starts

    # Seed the RNGs per fold so head initialisation, dropout and batch shuffling
    # are repeatable (the fold split itself is already seeded via random_state).
    np.random.seed(CFG.seed + fold)
    torch.manual_seed(CFG.seed + fold)

    # 1. Split the data, compute class weights, build the dataloaders
    train_fold_df = full_train_df.iloc[train_idx]
    val_fold_df = full_train_df.iloc[val_idx]

    # Inverse-frequency class weights computed on the training part of the fold.
    label_counts = train_fold_df['label'].value_counts().sort_index()
    class_weights = (len(train_fold_df) / (len(labels) * label_counts)).values
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    print(f"Calculated Class Weights: {class_weights_tensor.cpu().numpy()}")

    train_dataset = HallucinationDataset(train_fold_df, tokenizer, CFG.max_length)
    val_dataset = HallucinationDataset(val_fold_df, tokenizer, CFG.max_length)

    train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size * 2, shuffle=False, num_workers=2, pin_memory=True)

    # 2. Build the model and the training components
    model = HallucinationModel(CFG.model_name, CFG).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)
    num_training_steps = len(train_loader) * CFG.num_train_epochs // CFG.gradient_accumulation_steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * 0.1),
        num_training_steps=num_training_steps
    )
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.fp16)

    loss_fns = {
        'classifier': nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.1),
        'nli': nn.CrossEntropyLoss(label_smoothing=0.1),
        'coverage': nn.MSELoss()
    }

    # 3. Epoch loop
    best_fold_f1 = -1
    patience_counter = 0

    for epoch in range(CFG.num_train_epochs):
        start_time = time.time()
        print(f"\n--- Epoch {epoch+1}/{CFG.num_train_epochs} ---")

        avg_train_loss = train_epoch(model, train_loader, optimizer, scheduler, scaler, device, CFG, loss_fns)

        epoch_duration_secs = time.time() - start_time
        print(f"Epoch duration: {str(timedelta(seconds=epoch_duration_secs))}")

        # --- Detailed end-of-epoch report ---
        macro_f1, cm, report_str, val_actuals, val_preds = eval_model(model, val_loader, device)

        print(f"  -> Train Loss: {avg_train_loss:.4f}")
        print(f"  -> Macro-F1: {macro_f1:.4f}")
        print("\nClassification Report:\n", report_str)

        # Plot the confusion matrix and close the figure right away
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(f'Confusion Matrix - Fold {fold+1} Epoch {epoch+1}')
        plt.show()
        plt.close()  # close the figure to release its memory

        # --- Early stopping and best-model bookkeeping ---
        if macro_f1 > best_fold_f1:
            best_fold_f1 = macro_f1
            patience_counter = 0
            if best_fold_f1 > best_overall_f1:
                best_overall_f1 = best_fold_f1
                print(f"BEST MODEL FOUND! Saving state and val_idx...")
                # Snapshot the weights on the CPU: a deep copy of the state dict
                # would keep a second full set of weights on the GPU for all
                # remaining folds. load_state_dict accepts CPU tensors.
                best_model_state = {
                    name: tensor.detach().to("cpu", copy=True)
                    for name, tensor in model.state_dict().items()
                }
                best_val_idx = val_idx  # remember the matching validation indices
        else:
            patience_counter += 1
            print(f"No improvement in this fold. Patience: {patience_counter}/{CFG.early_stopping_patience}")
            if patience_counter >= CFG.early_stopping_patience:
                print("Early stopping for this fold!")
                break

        # --- End-of-epoch cleanup ---
        del macro_f1, cm, report_str, val_actuals, val_preds, avg_train_loss
        clean_memory()

    # --- End-of-fold cleanup ---
    del model, optimizer, scheduler, scaler, loss_fns, train_loader, val_loader, train_dataset, val_dataset
    print("-" * 60)

print("\n==================== K-FOLD TRAINING COMPLETE ====================")

# --- Evaluate the best model from the state kept in memory ---
if best_model_state is None:
    print("\nTraining finished, but no best model was saved in memory (best_model_state is None).")
else:
    print(f"The best overall model achieved a Macro-F1 of {best_overall_f1:.4f} during training.")
    print("\n--- Evaluating the best overall model on its corresponding best validation set ---")

    # Free VRAM before a fresh model instance is created for the evaluation.
    clean_memory()

    # Rebuild the architecture and load the stored weights into it.
    best_model = HallucinationModel(CFG.model_name, CFG).to(device)
    best_model.load_state_dict(best_model_state)

    # Validation split of the fold that produced the best model
    # (indices recorded in `best_val_idx` during training).
    best_val_df = full_train_df.iloc[best_val_idx]
    final_val_dataset = HallucinationDataset(best_val_df, tokenizer, CFG.max_length)
    final_val_loader = DataLoader(
        final_val_dataset,
        batch_size=CFG.batch_size * 4,  # a larger batch size is used for evaluation
        shuffle=False,
        num_workers=2,
        pin_memory=True,
    )

    # Single evaluation pass; the per-sample labels/predictions are not needed here.
    final_macro_f1, final_cm, final_report_str = eval_model(best_model, final_val_loader, device)[:3]

    # Detailed results
    print(f"\nRe-evaluated Macro-F1 on its best fold's validation set: {final_macro_f1:.4f}")

    print("\nFinal Classification Report:\n")
    print(final_report_str)

    # Final confusion matrix
    print("\nFinal Confusion Matrix:\n")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(final_cm, annot=True, fmt='d', cmap='Greens', xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'Final Confusion Matrix on Best Model (F1: {final_macro_f1:.4f})')
    plt.show()
    plt.close(fig)