In [None]:
# 1. Setup & Imports
from google.colab import drive
drive.mount("/content/drive")

import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from torch.cuda.amp import autocast, GradScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Paths: everything lives under one Drive folder; model/result dirs are created on demand.
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/data-science-challenge-competition"
DATA_DIR, MODEL_DIR, RESULT_DIR = (
    os.path.join(BASE_DIR, sub) for sub in ("data", "model", "result")
)

# Only the output directories are created; DATA_DIR must already exist.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULT_DIR, exist_ok=True)

# Checkpoint written by the training loop / CSV written by the inference cell.
best_path = os.path.join(MODEL_DIR, "best_multiinput_cafebert.pt")
submit_path = os.path.join(RESULT_DIR, "submit.csv")

In [None]:
# 2. Load Data (JSONL)
def read_jsonl(path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank / whitespace-only lines are not records; json.loads would
            # raise on them, so they are skipped instead of aborting the load.
            if line.strip():
                records.append(json.loads(line))
    return records

# Raw records for the two labelled splits and the unlabelled public test set.
train_records = read_jsonl(os.path.join(DATA_DIR, "processed/vihallu-train-split.jsonl"))
val_records   = read_jsonl(os.path.join(DATA_DIR, "processed/vihallu-val-split.jsonl"))
test_records  = read_jsonl(os.path.join(DATA_DIR, "jsonl/vihallu-public-test.jsonl"))

# Missing text fields become empty strings so downstream str() never yields "nan".
train_df, val_df, df_test = (
    pd.DataFrame(records).fillna({"context": "", "prompt": "", "response": ""})
    for records in (train_records, val_records, test_records)
)

# Label vocabulary is the sorted union of labels seen in train and val.
labels = sorted(
    set(train_df['label'].unique().tolist()) | set(val_df['label'].unique().tolist())
)
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = dict(enumerate(labels))

# Replace the raw labels with their integer ids.
train_df['label'] = train_df['label'].map(label2id)
val_df['label']   = val_df['label'].map(label2id)

print("Train/Val sizes:", len(train_df), len(val_df))

In [None]:
# 3. Tokenizer
MODEL_NAME = "uitnlp/CafeBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Opening/closing marker pair for each of the three text segments, in the
# order CONTEXT, PROMPT, RESPONSE.
specials = [
    marker
    for segment in ("CONTEXT", "PROMPT", "RESPONSE")
    for marker in (f"<{segment}>", f"</{segment}>")
]
num_added = tokenizer.add_special_tokens({"additional_special_tokens": specials})
print("Added special tokens:", num_added)

In [None]:
# 4. Dataset & Collate
class MultiInputDataset(Dataset):
    """Row-wise view over a DataFrame of context / prompt / response examples.

    Each item is a plain dict with the three text fields (as strings), an
    integer ``label`` (``-1`` when the frame has no ``label`` column) and the
    row's ``id`` (``None`` when the frame has no ``id`` column).
    """

    def __init__(self, df):
        # Re-number rows 0..n-1 so positional lookups match dataset indices.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        item = {
            field: str(record.get(field, ""))
            for field in ("context", "prompt", "response")
        }
        # Unlabelled frames (e.g. the test split) get the sentinel -1.
        item["label"] = int(record["label"]) if "label" in record else -1
        item["id"] = record.get("id", None)
        return item

def multiinput_collate(batch, tokenizer,
                       max_len_context=128, max_len_prompt=64, max_len_response=128,
                       use_markers=True):
    """Tokenize a list of dataset items into three padded tensor groups.

    Args:
        batch: List of dicts with keys ``context``, ``prompt``, ``response``,
            ``label`` and ``id``.
        tokenizer: Callable invoked once per segment with the list of texts
            plus ``truncation``, ``padding``, ``max_length`` and
            ``return_tensors="pt"``; its result must expose ``input_ids`` and
            ``attention_mask``.
        max_len_context: ``max_length`` passed for the context segment.
        max_len_prompt: ``max_length`` passed for the prompt segment.
        max_len_response: ``max_length`` passed for the response segment.
        use_markers: When true, each text is wrapped in its segment's
            opening/closing marker before tokenization.

    Returns:
        dict: ``*_input_ids`` / ``*_attention_mask`` for each segment, a
        ``labels`` LongTensor and the list of ``ids``.
    """
    if use_markers:
        contexts = [f"<CONTEXT> {ex['context']} </CONTEXT>" for ex in batch]
        prompts = [f"<PROMPT> {ex['prompt']} </PROMPT>" for ex in batch]
        responses = [f"<RESPONSE> {ex['response']} </RESPONSE>" for ex in batch]
    else:
        contexts = [ex["context"] for ex in batch]
        prompts = [ex["prompt"] for ex in batch]
        responses = [ex["response"] for ex in batch]
    labels = [ex["label"] for ex in batch]
    ids = [ex["id"] for ex in batch]

    def _encode(texts, limit):
        # Each segment is padded to the longest text in this batch's group.
        return tokenizer(texts, truncation=True, padding=True,
                         max_length=limit, return_tensors="pt")

    enc_ctx = _encode(contexts, max_len_context)
    enc_pr = _encode(prompts, max_len_prompt)
    enc_resp = _encode(responses, max_len_response)

    collated = {
        "context_input_ids": enc_ctx["input_ids"],
        "context_attention_mask": enc_ctx["attention_mask"],
        "prompt_input_ids": enc_pr["input_ids"],
        "prompt_attention_mask": enc_pr["attention_mask"],
        "response_input_ids": enc_resp["input_ids"],
        "response_attention_mask": enc_resp["attention_mask"],
    }
    collated["labels"] = torch.tensor(labels, dtype=torch.long)
    collated["ids"] = ids
    return collated

In [None]:

# 5. DataLoader
train_ds = MultiInputDataset(train_df)
val_ds   = MultiInputDataset(val_df)

# Per-class example counts ordered by label id, turned into inverse-frequency
# weights; every training row then gets the weight of its own class.
class_counts = train_df['label'].value_counts().sort_index().values
class_weights = 1.0 / (class_counts + 1e-12)
sample_weights = class_weights[train_df['label'].to_numpy()]

# Draw one epoch's worth of indices with replacement, proportionally to the weights.
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

BATCH_TRAIN, BATCH_EVAL = 8, 16
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_TRAIN,
    sampler=sampler,
    collate_fn=lambda b: multiinput_collate(b, tokenizer),
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_EVAL,
    shuffle=False,
    collate_fn=lambda b: multiinput_collate(b, tokenizer),
)

In [None]:
# 6. Model
class MultiInputCafeBERT(nn.Module):
    """Classifier that encodes context, prompt and response with one shared encoder.

    Each segment is passed through the same encoder; the hidden state at
    position 0 of each segment is concatenated and fed to a two-layer MLP head
    that produces ``num_labels`` logits.
    """

    def __init__(self, model_name, hidden_dropout=0.3, num_labels=3, tokenizer=None):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # The embedding matrix is resized to the tokenizer's vocabulary size
        # whenever a tokenizer is supplied.
        if tokenizer is not None:
            self.encoder.resize_token_embeddings(len(tokenizer))
        width = self.encoder.config.hidden_size
        self.dropout = nn.Dropout(hidden_dropout)
        head_layers = [
            nn.Linear(3 * width, width),
            nn.ReLU(),
            nn.Dropout(hidden_dropout),
            nn.Linear(width, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def _encode_segments(self, segments):
        """Run the shared encoder over each ``(input_ids, attention_mask)`` pair in order."""
        return [
            self.encoder(input_ids=token_ids, attention_mask=mask)
            for token_ids, mask in segments
        ]

    def forward(self, ctx_ids, ctx_mask, pr_ids, pr_mask, resp_ids, resp_mask):
        outputs = self._encode_segments(
            ((ctx_ids, ctx_mask), (pr_ids, pr_mask), (resp_ids, resp_mask))
        )
        # Keep only the position-0 vector of every segment.
        pooled = [out.last_hidden_state[:, 0, :] for out in outputs]
        fused = self.dropout(torch.cat(pooled, dim=1))
        return self.classifier(fused)


# One output logit per entry in the label vocabulary.
num_labels = len(label2id)

# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiInputCafeBERT(
    MODEL_NAME,
    hidden_dropout=0.3,
    num_labels=num_labels,
    tokenizer=tokenizer,
).to(device)

In [None]:
# 7. Loss, Optimizer, Scheduler
# Class weights relative to the most frequent class (which gets weight ~1).
inv_freq = class_counts.max() / (class_counts + 1e-12)
weights_tensor = torch.tensor(inv_freq, dtype=torch.float, device=device)

class FocalLoss(nn.Module):
    """Focal loss built on (optionally class-weighted) cross-entropy.

    For every sample the loss is ``(1 - p_t) ** gamma * CE_t`` where ``p_t`` is
    the softmax probability assigned to the true class and ``CE_t`` is that
    sample's own cross-entropy (scaled by its class weight when ``weight`` is
    given).

    Args:
        gamma: Focusing exponent; ``0`` reduces to plain per-sample cross-entropy.
        weight: Optional 1-D tensor of per-class weights passed to
            ``nn.CrossEntropyLoss``.

    The forward pass returns the unreduced, per-sample loss of shape
    ``(batch,)``; callers are expected to reduce it (e.g. ``.mean()``).
    """

    def __init__(self, gamma=2.0, weight=None):
        super().__init__()
        self.gamma = gamma
        # reduction="none" is essential: with the default "mean" every sample's
        # modulating factor would multiply the *batch-average* CE instead of
        # that sample's own CE, which is not focal loss.
        self.ce = nn.CrossEntropyLoss(weight=weight, reduction="none")

    def forward(self, logits, targets):
        """Compute per-sample focal loss.

        Args:
            logits: Float tensor of shape ``(batch, num_classes)``.
            targets: Long tensor of shape ``(batch,)`` with class indices.

        Returns:
            Tensor of shape ``(batch,)`` with one loss value per sample.
        """
        ce_loss = self.ce(logits, targets)
        probs = torch.softmax(logits, dim=1)
        # Probability the model assigns to each sample's true class.
        pt = probs.gather(1, targets.unsqueeze(1)).squeeze(1)
        return ((1 - pt) ** self.gamma) * ce_loss

criterion = FocalLoss(gamma=2.0, weight=weights_tensor)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Linear schedule: warm up over the first 10% of all optimizer steps, then decay.
EPOCHS = 8
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1*total_steps),
    num_training_steps=total_steps,
)

scaler = GradScaler()

# Early-stopping bookkeeping: best validation macro-F1 so far, how many
# non-improving epochs are tolerated, and how many have elapsed.
best_f1 = 0.0
patience = 3
patience_counter = 0

In [None]:
# 8. Evaluate
def evaluate(model, loader, device, criterion=None, id2label=None):
    """Run the model over ``loader`` and report loss and macro-F1.

    Args:
        model: Module called as ``model(ctx_ids, ctx_mask, pr_ids, pr_mask,
            resp_ids, resp_mask)`` and returning logits.
        loader: Iterable of batches shaped like the output of
            ``multiinput_collate`` (including ``labels``).
        device: Device the batch tensors are moved to.
        criterion: Optional loss callable; scalar losses are accumulated as-is
            and non-scalar losses are averaged per batch first. When ``None``
            the returned loss is ``0.0``.
        id2label: Optional ``{class_id: name}`` mapping. When given, a
            classification report is printed and a confusion-matrix heatmap is
            shown.

    Returns:
        tuple: ``(mean_loss_per_batch, macro_f1)``.
    """
    model.eval()
    all_preds, all_trues = [], []
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            ctx_ids  = batch["context_input_ids"].to(device)
            ctx_mask = batch["context_attention_mask"].to(device)
            pr_ids   = batch["prompt_input_ids"].to(device)
            pr_mask  = batch["prompt_attention_mask"].to(device)
            resp_ids = batch["response_input_ids"].to(device)
            resp_mask= batch["response_attention_mask"].to(device)
            labels   = batch["labels"].to(device)

            logits = model(ctx_ids, ctx_mask, pr_ids, pr_mask, resp_ids, resp_mask)
            if criterion is not None:
                loss = criterion(logits, labels)
                if isinstance(loss, torch.Tensor):
                    total_loss += loss.item() if loss.dim()==0 else loss.mean().item()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds.tolist())
            all_trues.extend(labels.cpu().numpy().tolist())

    f1 = f1_score(all_trues, all_preds, average="macro")
    if id2label is not None:
        label_ids = sorted(id2label)
        label_names = [id2label[i] for i in label_ids]
        # Pin the label set explicitly: without `labels=`, a class that appears
        # in neither the targets nor the predictions is dropped, which makes
        # `target_names` the wrong length for the report and misaligns the
        # heatmap tick labels with the confusion-matrix rows/columns.
        print(classification_report(all_trues, all_preds,
                                    labels=label_ids, target_names=label_names))
        cm = confusion_matrix(all_trues, all_preds, labels=label_ids)
        plt.figure(figsize=(6,5))
        sns.heatmap(cm, annot=True, fmt='d',
                    xticklabels=label_names,
                    yticklabels=label_names)
        plt.xlabel("Predicted"); plt.ylabel("True"); plt.show()
    return total_loss/len(loader), f1

In [None]:
# 9. Training Loop
# Mixed-precision fine-tuning with early stopping on validation macro-F1.
for epoch in range(1, EPOCHS+1):
    model.train()
    total_train_loss = 0.0
    for step, batch in enumerate(train_loader, start=1):
        ctx_ids  = batch["context_input_ids"].to(device)
        ctx_mask = batch["context_attention_mask"].to(device)
        pr_ids   = batch["prompt_input_ids"].to(device)
        pr_mask  = batch["prompt_attention_mask"].to(device)
        resp_ids = batch["response_input_ids"].to(device)
        resp_mask= batch["response_attention_mask"].to(device)
        labels   = batch["labels"].to(device)

        optimizer.zero_grad(set_to_none=True)
        with autocast():
            logits = model(ctx_ids, ctx_mask, pr_ids, pr_mask, resp_ids, resp_mask)
            loss = criterion(logits, labels)
            # The criterion may return per-sample losses; reduce to a scalar.
            if isinstance(loss, torch.Tensor) and loss.dim()>0:
                loss = loss.mean()

        scaler.scale(loss).backward()
        # After a scaled backward pass the gradients are still multiplied by
        # the loss scale. Unscale them first so the max-norm of 1.0 is applied
        # to the true gradient magnitudes; scaler.step() detects that the
        # gradients were already unscaled and does not unscale them again.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)
    val_loss, val_f1 = evaluate(model, val_loader, device, criterion, id2label=id2label)
    print(f"Epoch {epoch} | Train loss {avg_train_loss:.4f} | Val F1 {val_f1:.4f}")

    if val_f1 > best_f1:
        # New best: reset patience and checkpoint weights plus the label mapping.
        best_f1, patience_counter = val_f1, 0
        torch.save({"state_dict": model.state_dict(), "label2id": label2id}, best_path)
        print(">>> Saved best model")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(">>> Early stopping")
            break

In [None]:
# 10. Inference on Public Test
# Restore the best checkpoint written by the training loop.
ckpt = torch.load(best_path, map_location=device)
model.load_state_dict(ckpt["state_dict"])
model.eval()

# Decode predictions with the label mapping stored next to the weights, so the
# class ids always match the checkpoint even if the in-memory mapping was
# rebuilt differently. Falls back to the notebook's mapping for checkpoints
# saved without one.
ckpt_label2id = ckpt.get("label2id", label2id)
ckpt_id2label = {idx: name for name, idx in ckpt_label2id.items()}

test_ds = MultiInputDataset(df_test)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False,
                         collate_fn=lambda b: multiinput_collate(b, tokenizer))

all_preds = []
with torch.no_grad():
    for batch in test_loader:
        ctx_ids  = batch["context_input_ids"].to(device)
        ctx_mask = batch["context_attention_mask"].to(device)
        pr_ids   = batch["prompt_input_ids"].to(device)
        pr_mask  = batch["prompt_attention_mask"].to(device)
        resp_ids = batch["response_input_ids"].to(device)
        resp_mask= batch["response_attention_mask"].to(device)

        logits = model(ctx_ids, ctx_mask, pr_ids, pr_mask, resp_ids, resp_mask)
        preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
        all_preds.extend(preds)

# The loader is unshuffled, so predictions line up row-for-row with df_test.
assert len(all_preds) == len(df_test), "prediction count does not match test rows"

pred_labels = [ckpt_id2label[p] for p in all_preds]
out_df = pd.DataFrame({"id": df_test["id"], "predict_label": pred_labels})
out_df.to_csv(submit_path, index=False)
print("Saved predictions to", submit_path)