<a href="https://colab.research.google.com/github/ma55530/SemEval2026-CLARITY-FER/blob/main/hierarchiral/deberta-v3-base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing the necessary libraries**

In [25]:
pip -q install scikit-learn torch pandas datasets transformers torchvision accelerate

In [26]:
import pandas as pd
from datasets import Dataset, load_dataset
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


# **Loading the data**

In [27]:
print("Loading dataset...")
df = load_dataset("ailsntua/QEvasion")

def clarity_to_label(row):
    mapping = {
        "Clear Reply": 0,
        "Ambivalent": 1,
        "Clear Non-Reply": 2
    }
    row["label"] = mapping[row["clarity_label"]]

    # --- FIXED SHORT/LONG BINARY LABEL ---
    # This line was incorrect: row["binary_label"] = 0 if (row["clarity_label"] == 0 or row["clarity_label"] == 1) else 1
    # It should compare with the assigned integer label:
    row["binary_label"] = 0 if row["label"] == 0 else 1
    return row

df = df.map(clarity_to_label)
y_test = df["test"]["label"]


non_reply_lengths = [len(ans.split()) for ans, label in zip(df["train"]["interview_answer"], df["train"]["label"]) if label == 2]
other_lengths     = [len(ans.split()) for ans, label in zip(df["train"]["interview_answer"], df["train"]["label"]) if label != 2]

print("Mean length Clear Non-Reply:", np.mean(non_reply_lengths))
print("Mean length Others (Reply + Ambivalent):", np.mean(other_lengths))

print("Count Non-reply:", len(non_reply_lengths))
print("Count Others:", len(other_lengths))



Loading dataset...
Mean length Clear Non-Reply: 137.8061797752809
Mean length Others (Reply + Ambivalent): 311.506468305304
Count Non-reply: 356
Count Others: 3092


# **Defining the evaluation**

In [28]:
def evaluate(y_true, y_pred):
    print(classification_report(y_true, y_pred))
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
    plt.show()

def compute_metrics_binary(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "precision": precision_score(labels, preds, average='binary', zero_division=0),
        "recall": recall_score(labels, preds, average='binary', zero_division=0),
        "f1": f1_score(labels, preds, average='binary', zero_division=0)
    }

def compute_metrics_fine(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "precision": precision_score(labels, preds, average='macro', zero_division=0),
        "recall": recall_score(labels, preds, average='macro', zero_division=0),
        "f1": f1_score(labels, preds, average='macro', zero_division=0)
    }


# **Loading the model and tokenizer**

In [29]:


binary_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base", num_labels=2
)

fine_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-v3-base", num_labels=2
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

def tokenize(example):
    encoded = tokenizer(
        example["interview_question"] +
        "Answer: " + example["interview_answer"],
        padding="max_length",
        max_length=256,
        truncation=True
    )
    encoded["label"] = example["label"]
    encoded["binary_label"] = example["binary_label"]
    return encoded

tokenized_train = df["train"].map(tokenize)
tokenized_test = df["test"].map(tokenize)

cols_to_remove = ["interview_question", "interview_answer", "clarity_label"]

tokenized_train = tokenized_train.remove_columns(cols_to_remove)
tokenized_test  = tokenized_test.remove_columns(cols_to_remove)

tokenized_train.set_format("torch")
tokenized_test.set_format("torch")



Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Preparing the data for training**

In [30]:
# -----------------------------
# PREPARE BINARY DATA
# -----------------------------
binary_train = tokenized_train.remove_columns(["label"]).rename_column("binary_label", "labels")
binary_test  = tokenized_test.remove_columns(["label"]).rename_column("binary_label", "labels")

print("Binary label distribution (train):", set(int(x) for x in binary_train["labels"]))
print("Binary label distribution (test):",  set(int(x) for x in binary_test["labels"]))


# -----------------------------
# FILTER FOR FINE MODEL
# -----------------------------
train_fine = tokenized_train.filter(
    lambda ex: ex["binary_label"] == 1 and ex["label"] in [1, 2]
)
test_fine = tokenized_test.filter(
    lambda ex: ex["binary_label"] == 1 and ex["label"] in [1, 2]
)

# Remove columns not needed for fine
train_fine = train_fine.remove_columns(["binary_label"])
test_fine  = test_fine.remove_columns(["binary_label"])

# Rename label → labels
train_fine = train_fine.rename_column("label", "labels")
test_fine  = test_fine.rename_column("label", "labels")

# DEBUG before remap (should be {1,2})
unique_labels = set(int(x) for x in train_fine["labels"])
print("DEBUG unique fine labels (before remap):", unique_labels)

assert unique_labels <= {1,2}, f"Fine dataset must contain only 1 and 2 before remap, got: {unique_labels}"


# -----------------------------
# REMAP labels {1,2} → {0,1}
# -----------------------------
def remap_fn(ex):
    return {"labels": 0 if ex["labels"] == 1 else 1}

train_fine = train_fine.map(remap_fn)
test_fine  = test_fine.map(remap_fn)

# DEBUG after remap
unique_after = set(int(x) for x in train_fine["labels"])
print("DEBUG after remap:", unique_after)

assert unique_after == {0,1}, "Fine remapped labels must be {0,1}!"

print("Final binary sizes:", len(binary_train), len(binary_test))
print("Final fine sizes:", len(train_fine), len(test_fine))


Binary label distribution (train): {0, 1}
Binary label distribution (test): {0, 1}


Filter:   0%|          | 0/3448 [00:00<?, ? examples/s]

Filter:   0%|          | 0/308 [00:00<?, ? examples/s]

DEBUG unique fine labels (before remap): {1, 2}


Map:   0%|          | 0/2396 [00:00<?, ? examples/s]

Map:   0%|          | 0/229 [00:00<?, ? examples/s]

DEBUG after remap: {0, 1}
Final binary sizes: 3448 308
Final fine sizes: 2396 229


# **Defining the evaluation function and training**

In [31]:
os.environ["WANDB_DISABLED"] = "true"

class WeightedTrainer(Trainer):
    def __init__(self, weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = torch.nn.CrossEntropyLoss(weight=self.weights.to(model.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



trainer_binary = WeightedTrainer(
    weights=torch.tensor([1.0, 1.0]),
    model=binary_model,
    train_dataset=binary_train,
    eval_dataset=binary_test,
    compute_metrics=compute_metrics_binary,
    args=TrainingArguments(
        output_dir="binary_output",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
        report_to="none"
    ),
)
trainer_binary.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.554681,0.743506,1.0,0.852886
2,No log,0.526804,0.803774,0.930131,0.862348
3,0.561700,0.549352,0.836449,0.781659,0.808126


TrainOutput(global_step=648, training_loss=0.5325335749873409, metrics={'train_runtime': 386.6013, 'train_samples_per_second': 26.756, 'train_steps_per_second': 1.676, 'total_flos': 1360834782879744.0, 'train_loss': 0.5325335749873409, 'epoch': 3.0})

In [None]:
trainer_fine = Trainer(
    model=fine_model,
    train_dataset=train_fine,
    eval_dataset=test_fine,
    compute_metrics=compute_metrics_fine,
    args=TrainingArguments(
        output_dir="fine_output_unweighted",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
        report_to="none"
    ),
)

trainer_fine.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.306916,0.449782,0.5,0.473563
2,No log,0.249085,0.690588,0.70515,0.697459


In [None]:
def classify_two_stage(example):
    with torch.no_grad():
        logits = binary_model(**example).logits
    binary_pred = logits.argmax(dim=-1).item()

    if binary_pred == 0:
        return 0  # Clear Reply

    # Others → fine model
    with torch.no_grad():
        logits = fine_model(**example).logits
    fine_pred = logits.argmax(dim=-1).item()

    # fine: 0=Ambivalent, 1=Non-Reply → convert to original labels 1 and 2
    return 1 if fine_pred == 0 else 2


def plot_cm(trainer, dataset, display_labels):
    predictions = trainer.predict(dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    ConfusionMatrixDisplay.from_predictions(
        labels, preds, display_labels=display_labels
    )
    plt.show()


# FIXED LABEL ORDER ↓↓↓↓↓
plot_cm(trainer_binary, binary_test,
        ["Clear Reply", "Ambivalent / Non-Reply"])

plot_cm(trainer_fine, test_fine,
        ["Ambivalent", "Clear Non-Reply"])
