In [1]:
!pip install wandb
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
wnb_token  = user_secrets.get_secret("wandb")
wnb_name = 'hallu'
wandb.login(key=wnb_token)
wandb.init(name=wnb_name)



[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhaduong058a[0m ([33mhaduong058a-hcmussh-edu-vn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
    DataCollatorWithPadding
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os, json

# -----------------------------
# Dataset class
# -----------------------------
class VIHalluDataset(Dataset):
    """Map-style dataset that tokenizes one (question, context, response) triple per item.

    Args:
        contexts: Indexable collection of context passages.
        questions: Indexable collection of questions, aligned with ``contexts``.
        responses: Indexable collection of responses, aligned with ``contexts``.
        labels: Indexable collection of integer class ids; its length defines
            the dataset length.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors="pt")`` that returns a
            mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Sequence length every sample is truncated/padded to.
    """

    def __init__(self, contexts, questions, responses, labels, tokenizer, max_length=512):
        self.contexts, self.questions, self.responses = contexts, questions, responses
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            tokenizer output flattened) and a scalar ``labels`` tensor of
            dtype ``torch.long``.
        """
        # Coerce every field to str so non-string cells (e.g. numbers) still tokenize.
        ctx_text = str(self.contexts[idx])
        qst_text = str(self.questions[idx])
        rsp_text = str(self.responses[idx])

        # Build the model input: question, context and response joined by " [SEP] ".
        joined = " [SEP] ".join((qst_text, ctx_text, rsp_text))

        tokenizer_kwargs = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        encoded = self.tokenizer(joined, **tokenizer_kwargs)

        # The tokenizer output carries a leading batch dimension; flatten it away.
        sample = {key: encoded[key].flatten() for key in ("input_ids", "attention_mask")}
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# -----------------------------
# Metrics
# -----------------------------
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``, where
            ``predictions`` holds per-class scores along axis 1.

    Returns:
        dict with a single ``"accuracy"`` entry.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)
    return dict(accuracy=accuracy_score(gold, predicted))

# -----------------------------
# Main pipeline
# -----------------------------
def _pick_column(df, primary, fallback):
    """Return column ``primary`` of ``df`` as a list, or ``fallback`` when ``primary`` is absent."""
    column = primary if primary in df.columns else fallback
    return df[column].tolist()


def _extract_fields(df):
    """Return ``(contexts, questions, responses, labels)`` lists from ``df``; labels are lower-cased strings."""
    contexts = _pick_column(df, "context", "document")
    questions = _pick_column(df, "question", "prompt")
    responses = _pick_column(df, "response", "answer")
    labels = [str(label).lower() for label in df["label"].tolist()]
    return contexts, questions, responses, labels


def main():
    """Fine-tune a sequence classifier on the ViHallu CSVs and evaluate it.

    Pipeline: read the train/validation CSVs (dropping rows with missing
    values), encode the string labels, wrap the data in ``VIHalluDataset``,
    fine-tune with ``Trainer`` and save the model to ``output_dir``. The
    validation predictions are then used to print accuracy and a
    classification report, and to write ``confusion_matrix.png`` and
    ``eval_results.json`` into ``output_dir``.
    """
    # Use the new model
    model_name = "timpal0l/mdeberta-v3-base-squad2"
    train_path = "/kaggle/input/vihallu-train/vihallu-train.csv"
    # Fixed: the path previously started with a stray double slash ("//kaggle/...").
    val_path = "/kaggle/input/d/nguyendinhhaduong/vihallu/vihallu-warmup.csv"
    output_dir = "/kaggle/working/vihallu_model"

    # Load data
    print("Loading data...")
    df_train = pd.read_csv(train_path).dropna()
    df_val = pd.read_csv(val_path).dropna()

    # Column mapping (each text column has an alternative name as fallback)
    contexts_train, questions_train, responses_train, labels_train = _extract_fields(df_train)
    contexts_val, questions_val, responses_val, labels_val = _extract_fields(df_val)

    # Encode labels; fit on train + val so a label seen only in validation
    # can still be transformed.
    label_encoder = LabelEncoder()
    label_encoder.fit(labels_train + labels_val)
    y_train = label_encoder.transform(labels_train)
    y_val = label_encoder.transform(labels_val)
    class_names = label_encoder.classes_

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'sep_token': '[SEP]'})

    # Dataset
    # NOTE: VIHalluDataset tokenizes lazily in __getitem__, so the loops below
    # only exercise every sample once (results are discarded); they act as an
    # up-front check that each row can be tokenized before training starts.
    print("Tokenizing training data...")
    train_dataset = VIHalluDataset(contexts_train, questions_train, responses_train, y_train, tokenizer)
    for _ in tqdm(train_dataset, desc="Train samples", total=len(train_dataset)):
        pass

    print("Tokenizing validation data...")
    val_dataset = VIHalluDataset(contexts_val, questions_val, responses_val, y_val, tokenizer)
    for _ in tqdm(val_dataset, desc="Val samples", total=len(val_dataset)):
        pass

    # Model; ignore_mismatched_sizes lets the classification head be
    # re-initialised when the checkpoint's head has a different shape.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(class_names),
        ignore_mismatched_sizes=True
    )
    model.resize_token_embeddings(len(tokenizer))

    # Training args
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=1,   # batch size
        per_device_eval_batch_size=8,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        save_total_limit=2,
        learning_rate=2e-5,
        report_to='wandb',
        logging_strategy="steps"
    )

    data_collator = DataCollatorWithPadding(tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train
    print("Starting training...")
    trainer.train()
    trainer.save_model(output_dir)

    # Evaluate
    predictions = trainer.predict(val_dataset)
    preds = np.argmax(predictions.predictions, axis=1)
    true_labels = predictions.label_ids
    acc = accuracy_score(true_labels, preds)

    # Map integer ids back to their string labels once and reuse them.
    true_names = label_encoder.inverse_transform(true_labels)
    pred_names = label_encoder.inverse_transform(preds)

    # Pass `labels` explicitly: with only `target_names`, classification_report
    # raises ValueError whenever a class is absent from both the validation
    # labels and the predictions. This also keeps the report aligned with the
    # confusion matrix below.
    report = classification_report(
        true_names,
        pred_names,
        labels=class_names,
        target_names=class_names
    )
    cm = confusion_matrix(true_names, pred_names, labels=class_names)

    print(f"Validation Accuracy: {acc:.4f}")
    print(report)

    # Confusion matrix plot
    os.makedirs(output_dir, exist_ok=True)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.savefig(f"{output_dir}/confusion_matrix.png", dpi=300, bbox_inches="tight")

    # Save results
    results = {
        "accuracy": acc,
        "classification_report": report,
        "preds": pred_names.tolist(),
        "true_labels": true_names.tolist()
    }
    with open(f"{output_dir}/eval_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

# Entry point: run the full training/evaluation pipeline when this code is
# executed as the main module.
if __name__ == "__main__":
    main()


2025-09-18 04:15:29.764284: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758168930.160242      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758168930.268665      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading data...


tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Tokenizing training data...


Train samples: 100%|██████████| 7000/7000 [00:09<00:00, 704.60it/s]


Tokenizing validation data...


Val samples: 100%|██████████| 198/198 [00:00<00:00, 757.33it/s]


config.json:   0%|          | 0.00/961 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mathislucka/deberta-large-hallucination-eval-v3 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...




Epoch,Training Loss,Validation Loss


main()