In [1]:
from google.colab import drive
drive.mount("/content/drive")

# 1. Import libs
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from torch.amp import autocast, GradScaler
from huggingface_hub import login

Mounted at /content/drive


In [None]:
# Login HF
from dotenv import load_dotenv

load_dotenv()  # Load biến môi trường từ file .env

HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    raise ValueError("HF_TOKEN chưa được thiết lập trong biến môi trường hoặc file .env!")

# 2. Path Setup
BASE_DIR   = "/content/drive/MyDrive/Colab Notebooks/data-science-challenge-competition"
DATA_DIR   = os.path.join(BASE_DIR, "data")
MODEL_DIR  = os.path.join(BASE_DIR, "model")
RESULT_DIR = os.path.join(BASE_DIR, "result")

train_path  = os.path.join(DATA_DIR, "vihallu-train.csv")
test_path   = os.path.join(DATA_DIR, "vihallu-public-test.csv")
best_path   = os.path.join(MODEL_DIR, "best_cafebase_nli.pt")
submit_path = os.path.join(RESULT_DIR, "submit.csv")


In [3]:
# 3. Load Data
df = pd.read_csv(train_path)

# Encode label
label2id = {l:i for i,l in enumerate(df["label"].unique())}
id2label = {i:l for l,i in label2id.items()}
df["label"] = df["label"].map(label2id)

# Split: 70/15/15
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df   = train_test_split(test_df, test_size=0.5, random_state=42)

print(train_df.shape, val_df.shape, test_df.shape)

(4900, 5) (1050, 5) (1050, 5)


In [4]:
tokenizer = AutoTokenizer.from_pretrained("uitnlp/CafeBERT")

class ResponseDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=256, label2id=None, use_markers=True):
        self.df = dataframe.fillna({"context": "", "prompt": "", "response": ""})
        self.tok = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        self.use_markers = use_markers

    def __len__(self):
        return len(self.df)

    def _make_premise(self, context):
        return f"<CONTEXT> {context} </CONTEXT>" if self.use_markers else context

    def _make_hypothesis(self, prompt, response):
        if self.use_markers:
            return f"<PROMPT> {prompt} </PROMPT> <RESPONSE> {response} </RESPONSE>"
        return f"{prompt} {response}"

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        context, prompt, response = str(row["context"]), str(row["prompt"]), str(row["response"])
        label = int(row["label"])

        premise    = self._make_premise(context)
        hypothesis = self._make_hypothesis(prompt, response)

        enc = self.tok(
            text=premise,
            text_pair=hypothesis,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True
        )

        item = {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": label
        }
        if "token_type_ids" in enc:
            item["token_type_ids"] = enc["token_type_ids"]
        return item

# Dataset + Dataloader
train_dataset = ResponseDataset(train_df, tokenizer, max_len=512, label2id=label2id)
val_dataset   = ResponseDataset(val_df, tokenizer, max_len=512, label2id=label2id)
test_dataset  = ResponseDataset(test_df, tokenizer, max_len=512, label2id=label2id)

collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collator)
val_loader   = DataLoader(val_dataset, batch_size=8, collate_fn=collator)
test_loader  = DataLoader(test_dataset, batch_size=8, collate_fn=collator)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [5]:
def mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).float()
    return (last_hidden_state * mask).sum(1) / mask.sum(1).clamp(min=1e-9)

class CafeBERTNLIClassifier(nn.Module):
    def __init__(self, num_labels=3):
        super().__init__()
        self.base = AutoModel.from_pretrained("uitnlp/CafeBERT")
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.base.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.base(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
        x = mean_pool(outputs.last_hidden_state, attention_mask)
        x = self.dropout(x)
        return self.fc(x)

specials = ["<PROMPT>", "</PROMPT>", "<CONTEXT>", "</CONTEXT>", "<RESPONSE>", "</RESPONSE>"]
num_added = tokenizer.add_special_tokens({"additional_special_tokens": specials})

model = CafeBERTNLIClassifier(num_labels=len(label2id))
if num_added > 0:
    model.base.resize_token_embeddings(len(tokenizer))

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at uitnlp/CafeBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
EPOCHS = 10
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1*total_steps),
                                            num_training_steps=total_steps)

scaler = GradScaler("cuda", enabled=torch.cuda.is_available())
best_f1, patience, patience_counter = 0.0, 3, 0

def evaluate(model, loader, criterion, device, id2label=None):
    model.eval()
    total_loss, preds, trues = 0.0, [], []
    with torch.no_grad():
        for batch in loader:
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch.get("token_type_ids", None)
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            preds.extend(logits.argmax(dim=1).cpu().numpy().tolist())
            trues.extend(labels.cpu().numpy().tolist())

    avg_loss = total_loss / max(1, len(loader))
    f1 = f1_score(trues, preds, average="macro")
    print(classification_report(trues, preds, target_names=[id2label[i] for i in sorted(id2label)]))
    return avg_loss, f1

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad(set_to_none=True)
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch.get("token_type_ids", None)
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        labels = batch["labels"].to(device)

        with autocast("cuda", enabled=torch.cuda.is_available()):
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        total_loss += loss.item()

    train_loss = total_loss / len(train_loader)
    val_loss, val_f1 = evaluate(model, val_loader, criterion, device, id2label=id2label)

    print(f"Epoch {epoch}/{EPOCHS} | Train Loss {train_loss:.4f} | Val F1 {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        torch.save({"state_dict": model.state_dict(), "label2id": label2id}, best_path)
        print(">>> Saved best model")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(">>> Early stopping!")
            break

              precision    recall  f1-score   support

   extrinsic       0.65      0.86      0.74       355
          no       0.73      0.86      0.79       348
   intrinsic       0.69      0.34      0.46       347

    accuracy                           0.69      1050
   macro avg       0.69      0.69      0.66      1050
weighted avg       0.69      0.69      0.66      1050

Epoch 1/10 | Train Loss 1.0469 | Val F1 0.6629
>>> Saved best model
              precision    recall  f1-score   support

   extrinsic       0.79      0.79      0.79       355
          no       0.80      0.86      0.83       348
   intrinsic       0.73      0.67      0.70       347

    accuracy                           0.78      1050
   macro avg       0.77      0.78      0.77      1050
weighted avg       0.77      0.78      0.77      1050

Epoch 2/10 | Train Loss 0.7538 | Val F1 0.7731
>>> Saved best model
              precision    recall  f1-score   support

   extrinsic       0.83      0.60      0.70    

In [7]:
ckpt = torch.load(best_path, map_location=device)
model.load_state_dict(ckpt["state_dict"])
label2id = ckpt["label2id"]
id2label = {v: k for k, v in label2id.items()}

model.eval()

df_input = pd.read_csv(test_path)
pred_labels = []

for _, row in df_input.iterrows():
    premise    = f"<CONTEXT> {row.get('context','')} </CONTEXT>"
    hypothesis = f"<PROMPT> {row.get('prompt','')} </PROMPT> <RESPONSE> {row.get('response','')} </RESPONSE>"

    enc = tokenizer(text=premise,
                    text_pair=hypothesis,
                    truncation=True,
                    max_length=512,
                    return_token_type_ids=True)

    input_ids      = torch.tensor([enc["input_ids"]], device=device)
    attention_mask = torch.tensor([enc["attention_mask"]], device=device)
    token_type_ids = torch.tensor([enc["token_type_ids"]], device=device) if "token_type_ids" in enc else None

    with torch.no_grad():
        logits = model(input_ids, attention_mask, token_type_ids)
        pred_id = torch.argmax(logits, dim=1).item()
        pred_labels.append(id2label[pred_id])

df_output = pd.DataFrame({"id": df_input["id"], "predict_label": pred_labels})
df_output.to_csv(submit_path, index=False)
print(f"Kết quả đã lưu tại {submit_path}")

Kết quả đã lưu tại /content/drive/MyDrive/Colab Notebooks/data-science-challenge-competition/result/submit.csv
