In [None]:
!pip install evaluate
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# Class names in id order; the two lookup tables below are derived from this
# single sequence so they cannot disagree with each other.
label2id = {name: idx for idx, name in enumerate(("no", "intrinsic", "extrinsic"))}
id2label = dict(zip(label2id.values(), label2id.keys()))

In [None]:
import pandas as pd
from datasets import load_dataset, concatenate_datasets, DatasetDict

# ======================
# Step 1: Normalise the CSV files
# ======================
def align_columns(path, out_path):
    """Rewrite one CSV so that every split ends up with the same schema.

    Reads ``path``, drops the stray ``Unnamed: 0`` index column when present,
    guarantees a float ``predict_label`` column, stringifies the labels that
    are actually present, removes rows whose ``context``, ``prompt`` or
    ``response`` is missing, and writes the result to ``out_path`` without
    the pandas index.

    Args:
        path: CSV file to read.
        out_path: Destination of the cleaned CSV.

    Returns:
        None. The cleaned table is only written to ``out_path``.

    Raises:
        KeyError: if ``context``, ``prompt`` or ``response`` is not a column.
        ValueError: if an existing ``predict_label`` column cannot be cast to
            float.
    """
    df = pd.read_csv(path)

    # Drop the leftover index column, if the file was saved with one
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0"])

    # Make sure predict_label exists so all splits share the same columns
    if "predict_label" not in df.columns:
        df["predict_label"] = None

    # Normalise dtypes. Only real label values are cast: a blanket
    # astype(str) would turn a missing label into the literal text "nan",
    # which then only looks "missing" to readers that treat "nan" as NA.
    if "label" in df.columns:
        df["label"] = df["label"].map(str, na_action="ignore")
    # predict_label is guaranteed to exist at this point
    df["predict_label"] = df["predict_label"].astype(float)

    # Discard rows with an empty context / prompt / response
    df = df.dropna(subset=["context", "prompt", "response"])

    # Persist the normalised file
    df.to_csv(out_path, index=False)

# Normalise every raw file; warmup.csv is written out as the validation file.
for raw_csv, fixed_csv in (
    ("/content/vihallu_train.csv", "/content/train-fixed.csv"),
    ("/content/test.csv", "/content/test-fixed.csv"),
    ("/content/warmup.csv", "/content/valid-fixed.csv"),
):
    align_columns(raw_csv, fixed_csv)

# ======================
# Step 2: Load the normalised files with HuggingFace Datasets
# ======================
data_files = dict(
    train="/content/train-fixed.csv",
    validation="/content/valid-fixed.csv",  # normalised warmup.csv
    test="/content/test-fixed.csv",
)

# One named split per entry in data_files.
raw_dataset = load_dataset("csv", data_files=data_files)

# ======================
# Step 3: Pool train + warmup, then re-split 85/15
# ======================
pooled_parts = [raw_dataset[name] for name in ("train", "validation")]
merged = concatenate_datasets(pooled_parts)
# Fixed seed keeps the 85/15 partition identical between runs.
split_dataset = merged.train_test_split(test_size=0.15, seed=42)

# ======================
# Step 4: Assemble the final DatasetDict
# ======================
final_splits = {}
final_splits["train"] = split_dataset["train"]
# The held-out 15% of the pooled data serves as validation.
final_splits["validation"] = split_dataset["test"]
final_splits["test"] = raw_dataset["test"]
dataset = DatasetDict(final_splits)

print(dataset)




Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'prompt', 'response', 'label', 'predict_label'],
        num_rows: 6109
    })
    validation: Dataset({
        features: ['id', 'context', 'prompt', 'response', 'label', 'predict_label'],
        num_rows: 1079
    })
    test: Dataset({
        features: ['id', 'context', 'prompt', 'response', 'label', 'predict_label'],
        num_rows: 1000
    })
})


In [None]:
import pandas as pd

# Convert each split back to pandas and report missing text fields.
text_columns = ["context", "prompt", "response"]
for split in ["train", "validation", "test"]:
    df = dataset[split].to_pandas()
    null_mask = df[text_columns].isnull()

    # Number of empty cells in each of the three text columns
    missing_counts = null_mask.sum()

    print(f"===> Split: {split}")
    print(missing_counts)
    print("-" * 50)

    # Rows with at least one empty text field; nothing more to show if none
    empty_rows = df[null_mask.any(axis=1)]
    if empty_rows.empty:
        continue
    print(f"Các dòng bị thiếu trong {split}:")
    print(empty_rows.head())  # preview the first 5 offending rows


===> Split: train
context     0
prompt      0
response    0
dtype: int64
--------------------------------------------------
===> Split: validation
context     0
prompt      0
response    0
dtype: int64
--------------------------------------------------
===> Split: test
context     0
prompt      0
response    0
dtype: int64
--------------------------------------------------


In [None]:
model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label configuration passed to the sequence-classification model: three
# classes, named by the notebook's label2id / id2label tables.
label_config = {
    "num_labels": 3,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(model_name, **label_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
def preprocess(example):
    """Tokenise one row as a (premise, hypothesis) pair.

    The premise is ``context`` and ``prompt`` joined by a single space; the
    hypothesis is ``response``. The pair is truncated and padded to 1024
    tokens. When the row carries a non-null ``label`` it is mapped through
    ``label2id`` and stored under ``labels``.

    Args:
        example: Mapping with ``context``, ``prompt`` and ``response``
            entries, and optionally ``label``.

    Returns:
        The tokenizer output, with ``labels`` added for labelled rows.
    """
    encoded = tokenizer(
        " ".join((example["context"], example["prompt"])),
        example["response"],
        truncation=True,
        max_length=1024,
        padding="max_length",
    )
    # Rows without a gold label are returned without a "labels" entry
    gold = example.get("label")
    if gold is None:
        return encoded
    encoded["labels"] = label2id[example["label"]]
    return encoded

encoded_dataset = dataset.map(preprocess, batched=False)

# Keep only the model inputs. A split that produced no "labels" column (the
# original comment names test as the example) is covered by the same
# keep-list, because filtering on an absent column removes nothing extra.
keep_cols = ["input_ids", "attention_mask", "labels"]
columns_to_remove = {}
for split in encoded_dataset.keys():
    present = encoded_dataset[split].column_names
    columns_to_remove[split] = [name for name in present if name not in keep_cols]
    encoded_dataset[split] = encoded_dataset[split].remove_columns(columns_to_remove[split])


Map:   0%|          | 0/6109 [00:00<?, ? examples/s]

Map:   0%|          | 0/1079 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Return torch tensors from every split; "labels" is included only where the
# split actually has that column.
for split in encoded_dataset.keys():
    tensor_columns = ["input_ids", "attention_mask"]
    if "labels" in encoded_dataset[split].column_names:
        tensor_columns.append("labels")
    encoded_dataset[split].set_format(type="torch", columns=tensor_columns)


In [None]:
!pip install wandb
import os

import wandb

# kaggle_secrets is only importable inside Kaggle kernels, while the rest of
# this notebook reads and writes under /content. Outside Kaggle, fall back to
# the WANDB_API_KEY environment variable instead of failing on the import.
# If that is unset too, wnb_token is None and wandb.login() resolves the
# credentials on its own.
try:
    from kaggle_secrets import UserSecretsClient
except ImportError:
    wnb_token = os.environ.get("WANDB_API_KEY")
else:
    user_secrets = UserSecretsClient()
    wnb_token = user_secrets.get_secret("wandb")

wnb_name = 'hallu1'  # run name passed to wandb.init
wandb.login(key=wnb_token)
wandb.init(name=wnb_name)



  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhaduong058a[0m ([33mhaduong058a-hcmussh-edu-vn[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from transformers import EarlyStoppingCallback


In [None]:
# Metric objects are loaded once and reused on every evaluation call.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(-1)
    acc_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")
    return {"accuracy": acc_result["accuracy"], "f1": f1_result["f1"]}

# Evaluate and checkpoint every 100 steps, keep at most two checkpoints, and
# restore the checkpoint with the highest "f1" once training ends.
checkpoint_options = dict(
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Batch of 2 with 8 accumulation steps = 16 sequences per optimiser step
# on each device.
optimisation_options = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
)

training_args = TrainingArguments(
    output_dir="/content/vihallu_model_mdeberta_nli",
    logging_dir="/content/logs",
    report_to="wandb",
    **checkpoint_options,
    **optimisation_options,
)

# Build the Trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword of Trainer.__init__ is deprecated in recent
# transformers releases and emits a FutureWarning on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Stop after 3 consecutive evaluations without an improvement in the
    # metric named by metric_for_best_model.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


In [None]:
trainer.train()

# Score on a split that has gold labels. Without a "labels" column,
# evaluate() can only report runtime statistics (no loss, accuracy or F1),
# and the early-stopping callback logs that it cannot find eval_f1. Use the
# test split when it is labelled, otherwise the validation split.
scored_split = "test" if "labels" in encoded_dataset["test"].column_names else "validation"
results = trainer.evaluate(encoded_dataset[scored_split])
print(results)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.759318,0.673772,0.668307
200,No log,0.665984,0.752549,0.755337
300,No log,0.604062,0.774791,0.773664
400,No log,0.610241,0.7924,0.792504
500,0.694700,0.625476,0.78962,0.790489
600,0.694700,0.596727,0.795181,0.795301
700,0.694700,0.583876,0.7924,0.793317
800,0.694700,0.660923,0.800741,0.799695
900,0.694700,0.68784,0.791474,0.791113
1000,0.455400,0.633416,0.799815,0.799327


early stopping required metric_for_best_model, but did not find eval_f1 so early stopping is disabled


{'eval_runtime': 15.3004, 'eval_samples_per_second': 65.358, 'eval_steps_per_second': 32.679, 'epoch': 2.8798690671031095}


In [None]:
import pandas as pd

# Read the raw test CSV (not the normalised copy); its "id" column is used
# below when the submission file is written.
pub_test = pd.read_csv("/content/test.csv")

def preprocess_warmup(example):
    """Tokenise one unlabelled row as a (premise, hypothesis) pair.

    Same encoding as the training-time preprocessing, but no ``labels``
    entry is ever added: ``context`` and ``prompt`` joined by a space form
    the premise, ``response`` is the hypothesis, and the pair is truncated
    and padded to 1024 tokens.

    Args:
        example: Mapping with ``context``, ``prompt`` and ``response``.

    Returns:
        The tokenizer output for the pair.
    """
    first_segment = " ".join((example["context"], example["prompt"]))
    second_segment = example["response"]
    return tokenizer(
        first_segment,
        second_segment,
        truncation=True,
        max_length=1024,
        padding="max_length",
    )

# Despite the "warmup" names, the rows encoded here come from pub_test.
warmup_dataset = Dataset.from_pandas(pub_test)
encoded_warmup = warmup_dataset.map(preprocess_warmup, batched=False)

# Keep the model inputs plus the row id; drop every other column.
kept_for_prediction = ("input_ids", "attention_mask", "id")
dropped_columns = [name for name in encoded_warmup.column_names if name not in kept_for_prediction]
encoded_warmup = encoded_warmup.remove_columns(dropped_columns)

# ====== predict ======
predictions = trainer.predict(encoded_warmup)
# Class id = index of the largest logit for each row
preds = predictions.predictions.argmax(-1)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Translate predicted class ids back to label names and write the submission.
id2label = dict(enumerate(("no", "intrinsic", "extrinsic")))
pred_labels = list(map(id2label.__getitem__, preds))

submission_columns = {"id": pub_test["id"], "predict_label": pred_labels}
output_df = pd.DataFrame(submission_columns)
output_df.to_csv("/content/submission_5e.csv", index=False)