<a href="https://colab.research.google.com/github/ma55530/SemEval2026-CLARITY-FER/blob/main/hierarchiral/BERT(ClearRvsRest)v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Installing the necessary libraries**

In [None]:
%pip -q install scikit-learn torch pandas datasets transformers torchvision accelerate

In [None]:
import pandas as pd
from datasets import Dataset, load_dataset
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


# **Loading the data**

In [None]:
# Fetch the QEvasion dataset by its repository id.
# NOTE: despite its name, `df` is not a pandas DataFrame -- it is whatever
# `load_dataset` returns and is indexed by split name ("train" / "test") below.
print("Loading dataset...")
df = load_dataset("ailsntua/QEvasion")

def clarity_to_label(row):
    """Attach integer targets derived from the row's ``clarity_label`` string.

    Two keys are written into ``row`` (in place) and the same object is returned:

    * ``label``        -- 0 = "Clear Reply", 1 = "Ambivalent", 2 = "Clear Non-Reply"
    * ``binary_label`` -- 0 for "Clear Reply", 1 for either of the other classes

    Raises ``KeyError`` when ``clarity_label`` is not one of the three names above.
    """
    class_ids = dict(zip(("Clear Reply", "Ambivalent", "Clear Non-Reply"), range(3)))
    class_id = class_ids[row["clarity_label"]]

    row["label"] = class_id
    # Stage-one target: "Clear Reply" (0) versus everything else (1).
    row["binary_label"] = int(class_id != 0)
    return row

# Add the integer `label` / `binary_label` columns to every split.
df = df.map(clarity_to_label)
y_test = df["test"]["label"]

# Whitespace-token length of every training answer, computed once and then
# partitioned by class: label 2 is "Clear Non-Reply", anything else is "Others".
train_labels = df["train"]["label"]
answer_word_counts = [len(answer.split()) for answer in df["train"]["interview_answer"]]

non_reply_lengths = [n for n, cls in zip(answer_word_counts, train_labels) if cls == 2]
other_lengths = [n for n, cls in zip(answer_word_counts, train_labels) if cls != 2]

print("Mean length Clear Non-Reply:", np.mean(non_reply_lengths))
print("Mean length Others (Reply + Ambivalent):", np.mean(other_lengths))

print("Count Non-reply:", len(non_reply_lengths))
print("Count Others:", len(other_lengths))



Loading dataset...
Mean length Clear Non-Reply: 137.8061797752809
Mean length Others (Reply + Ambivalent): 311.506468305304
Count Non-reply: 356
Count Others: 3092


# **Defining the evaluation**

In [None]:
def evaluate(y_true, y_pred):
    """Print the per-class classification report and show the confusion matrix.

    Args:
        y_true: ground-truth class ids.
        y_pred: predicted class ids, aligned with ``y_true``.
    """
    report = classification_report(y_true, y_pred)
    print(report)
    ConfusionMatrixDisplay.from_predictions(y_true=y_true, y_pred=y_pred)
    plt.show()

def compute_metrics_binary(eval_pred):
    """Precision / recall / F1 for the two-class stage.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f1"``, each computed
        with ``average='binary'`` (scikit-learn's default positive class) and
        ``zero_division=0``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {
        name: scorer(gold, predicted, average='binary', zero_division=0)
        for name, scorer in scorers
    }

def compute_metrics_fine(eval_pred):
    """Macro-averaged precision / recall / F1 for the multi-class stage.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys ``"precision"``, ``"recall"`` and ``"f1"``; every class
        contributes equally (``average='macro'``) and undefined ratios count
        as 0 (``zero_division=0``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    shared = dict(average='macro', zero_division=0)
    metrics = {}
    metrics["precision"] = precision_score(gold, predicted, **shared)
    metrics["recall"] = recall_score(gold, predicted, **shared)
    metrics["f1"] = f1_score(gold, predicted, **shared)
    return metrics


# **Loading the model and tokenizer**

In [None]:


# Both stages start from the same pretrained checkpoint; they differ only in
# the width of the classification head (2 classes for the binary stage,
# 3 classes for the fine stage).
CHECKPOINT = "bert-base-uncased"

binary_model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)
fine_model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=3)

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

def tokenize(example):
    """Encode one interview question/answer example for the BERT models.

    The question and the answer are passed to the tokenizer as a sentence
    *pair* rather than as one concatenated string. Plain concatenation glued
    the last word of the question directly onto "Answer: " (no whitespace or
    separator between them); the pair form lets the tokenizer insert its own
    separator token and segment ids between the two texts.

    Args:
        example: a single dataset row providing ``interview_question``,
            ``interview_answer``, ``label`` and ``binary_label``.

    Returns:
        The tokenizer output, padded/truncated to 256 tokens, with the
        ``label`` and ``binary_label`` values copied onto it.
    """
    encoded = tokenizer(
        "Question: " + example["interview_question"],
        "Answer: " + example["interview_answer"],
        padding="max_length",
        max_length=256,
        # For a pair, `True` trims tokens from whichever text is currently longer.
        truncation=True,
    )
    encoded["label"] = example["label"]
    encoded["binary_label"] = example["binary_label"]
    return encoded

# Raw text columns that are no longer needed once the examples are encoded.
# NOTE(review): only these three are dropped; any other columns the dataset
# carries stay on the tokenized splits -- confirm that is intended.
cols_to_remove = ["interview_question", "interview_answer", "clarity_label"]

# Encode each split and strip the raw text columns in one pass per split.
tokenized_train, tokenized_test = (
    df[split_name].map(tokenize).remove_columns(cols_to_remove)
    for split_name in ("train", "test")
)

# Return torch tensors when the datasets are indexed.
tokenized_train.set_format(type="torch")
tokenized_test.set_format(type="torch")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

# **Preparing the data for training**

In [None]:
# -----------------------------
# PREPARE BINARY DATA
# -----------------------------
# Stage one only needs the binary target. It is exposed under the column name
# "labels", which is the key WeightedTrainer.compute_loss reads from its inputs.
binary_train = tokenized_train.remove_columns(["label"]).rename_column("binary_label", "labels")
binary_test  = tokenized_test.remove_columns(["label"]).rename_column("binary_label", "labels")


def _label_counts(dataset):
    """Return ``{class_id: number_of_examples}`` for a dataset's ``labels`` column."""
    class_ids, counts = np.unique([int(x) for x in dataset["labels"]], return_counts=True)
    return dict(zip(class_ids.tolist(), counts.tolist()))


# Report how many examples fall in each class (not merely which classes occur),
# so any class imbalance is visible before training.
print("Binary label distribution (train):", _label_counts(binary_train))
print("Binary label distribution (test):",  _label_counts(binary_test))


# -----------------------------
# NOTE: Fine model data will be prepared AFTER binary training
# -----------------------------
print("Fine model data will be prepared after binary training using binary model predictions")
print("Final binary sizes:", len(binary_train), len(binary_test))


Binary label distribution (train): {0, 1}
Binary label distribution (test): {0, 1}
Fine model data will be prepared after binary training using binary model predictions
Final binary sizes: 3448 308


# **Defining the weighted trainer and training the binary model**

In [None]:
# Switch off Weights & Biases logging through its environment variable.
# The TrainingArguments used in this notebook also pass report_to="none".
os.environ["WANDB_DISABLED"] = "true"

class WeightedTrainer(Trainer):
    """``Trainer`` whose training loss is a class-weighted cross-entropy.

    Args:
        weights: optional 1-D float tensor with one weight per class, handed to
            ``torch.nn.CrossEntropyLoss``. ``None`` (the default) means plain,
            unweighted cross-entropy.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                just ``loss``.
            **kwargs: accepted for compatibility with the base-class call
                signature and otherwise ignored.
        """
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits

        # Place the weights next to the logits instead of asking the model for
        # its device: a wrapped model (e.g. torch.nn.DataParallel) does not
        # expose a `.device` attribute. Leave them as None when no weights
        # were supplied so the default constructor argument actually works.
        class_weights = None if self.weights is None else self.weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



# Stage one: "Clear Reply" (0) vs. everything else (1).
# NOTE(review): the weights are uniform, so this is equivalent to unweighted
# cross-entropy; derive them from the class frequencies if weighting is wanted.
# NOTE(review): eval_dataset is the test split, so `load_best_model_at_end`
# selects the checkpoint on test data; a held-out validation split would keep
# the reported test metrics unbiased.
trainer_binary = WeightedTrainer(
    weights=torch.tensor([1.0, 1.0]),
    model=binary_model,
    train_dataset=binary_train,
    eval_dataset=binary_test,
    compute_metrics=compute_metrics_binary,
    args=TrainingArguments(
        output_dir="binary_output",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # Mixed precision needs a CUDA device; fall back to full precision
        # elsewhere so the cell still runs on a CPU-only runtime.
        fp16=torch.cuda.is_available(),
        report_to="none"
    ),
)
trainer_binary.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.546568,0.760943,0.9869,0.859316
2,No log,0.543532,0.802281,0.921397,0.857724
3,0.568200,0.546794,0.8,0.908297,0.850716


TrainOutput(global_step=648, training_loss=0.5469124169997227, metrics={'train_runtime': 331.776, 'train_samples_per_second': 31.178, 'train_steps_per_second': 1.953, 'total_flos': 1360810378321920.0, 'train_loss': 0.5469124169997227, 'epoch': 3.0})

In [None]:
# -----------------------------
# PREPARE FINE DATA USING BINARY MODEL OUTPUT
# -----------------------------
print("Creating fine model data using binary model predictions...")

# 1) Stage-one predictions on the test split. Predictions on the training
#    split are not needed: the fine model trains on every training example.
out_test = trainer_binary.predict(binary_test)
binary_preds_test = np.argmax(out_test.predictions, axis=-1)

# 2) Train: keep all examples, with their original 3-class labels (0/1/2).
#    Test: keep only the examples the binary model routed to "Others" (1).
#    Their true label can still be 0 when the binary model was wrong.
selected_train_idx = list(range(len(tokenized_train)))
selected_test_idx = [i for i, p in enumerate(binary_preds_test) if p == 1]

# 3) Build the fine datasets from the tokenized data using the selected indices.
train_fine = tokenized_train.select(selected_train_idx)
test_fine = tokenized_test.select(selected_test_idx)

# The fine model is trained on the 3-class `label`; drop the binary target and
# expose `label` under the column name "labels".
train_fine = train_fine.remove_columns(["binary_label"]).rename_column("label", "labels")
test_fine = test_fine.remove_columns(["binary_label"]).rename_column("label", "labels")




Creating fine model data using binary model predictions...


In [None]:
# Stage two: 3-class model (num_labels=3) trained with the default, unweighted
# loss on the full training set and evaluated on the test examples that the
# binary model labelled as "Others".
# NOTE(review): as with the binary stage, the best checkpoint is chosen on
# test-derived data; a held-out validation split would avoid that.
trainer_fine = Trainer(
    model=fine_model,
    train_dataset=train_fine,
    eval_dataset=test_fine,
    compute_metrics=compute_metrics_fine,
    args=TrainingArguments(
        output_dir="fine_output_unweighted",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # Mixed precision needs a CUDA device; fall back to full precision
        # elsewhere so the cell still runs on a CPU-only runtime.
        fp16=torch.cuda.is_available(),
        report_to="none"
    ),
)

trainer_fine.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.765452,0.529884,0.412149,0.394545
2,No log,0.730113,0.532631,0.50828,0.46798
3,0.824700,0.717076,0.546201,0.514255,0.502269


KeyboardInterrupt: 

In [None]:
def classify_two_stage(example):
    """Predict the 3-class clarity label for one encoded example.

    The binary model decides first: class 0 means "Clear Reply" and is returned
    immediately. Otherwise the fine model is consulted. The fine model has a
    3-way head trained on the original label ids (0 = Clear Reply,
    1 = Ambivalent, 2 = Clear Non-Reply), so its argmax is already in the
    original label space and is returned as-is -- it must not be re-mapped as
    if it were a 2-class "Ambivalent vs. Non-Reply" head.

    Args:
        example: mapping of keyword inputs accepted by both models, holding a
            batch of exactly one example (``.item()`` is called on the argmax).
            # assumes the tensors already live on the models' device -- TODO confirm at call sites

    Returns:
        int: 0, 1 or 2.
    """
    # Inference must run with dropout disabled; the models may have been left
    # in training mode (e.g. after an interrupted training run).
    binary_model.eval()
    fine_model.eval()

    with torch.no_grad():
        logits = binary_model(**example).logits
    binary_pred = logits.argmax(dim=-1).item()

    if binary_pred == 0:
        return 0  # Clear Reply

    # Others → fine model
    with torch.no_grad():
        logits = fine_model(**example).logits
    fine_pred = logits.argmax(dim=-1).item()

    # Already an original label id (0/1/2); no conversion needed.
    return fine_pred


def plot_cm(trainer, dataset, display_labels):
    """Show the confusion matrix of ``trainer``'s predictions on ``dataset``.

    Args:
        trainer: object whose ``predict(dataset)`` result exposes
            ``predictions`` (per-class scores) and ``label_ids`` (true labels).
        dataset: the dataset to run prediction on.
        display_labels: tick labels for the matrix, one per class.
    """
    result = trainer.predict(dataset)
    ConfusionMatrixDisplay.from_predictions(
        y_true=result.label_ids,
        # Predicted class = highest-scoring entry along the last axis.
        y_pred=np.argmax(result.predictions, axis=-1),
        display_labels=display_labels,
    )
    plt.show()


# Binary stage: class 0 = Clear Reply, class 1 = everything else.
plot_cm(trainer_binary, binary_test,
        ["Clear Reply", "Ambivalent / Non-Reply"])

# Fine stage: the model has a 3-way head over the original label ids, and its
# evaluation subset keeps the original 3-class labels, so the matrix needs all
# three class names in id order (0, 1, 2).
# NOTE(review): this assumes every class occurs among the true or predicted
# labels of `test_fine`; otherwise the matrix has fewer rows than names.
plot_cm(trainer_fine, test_fine,
        ["Clear Reply", "Ambivalent", "Clear Non-Reply"])
