<a href="https://colab.research.google.com/github/ma55530/SemEval2026-CLARITY-FER/blob/main/multi_task/multi_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import os
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, accuracy_score
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

In [2]:
# Disable the Weights & Biases logging integration for this session.
# NOTE(review): the training cell's output reports this variable as deprecated
# in favour of the `report_to` argument.
os.environ["WANDB_DISABLED"] = "true"

In [15]:
import torch.nn.functional as F

class MultiTaskModel(nn.Module):
    """Shared transformer encoder with two linear classification heads.

    The embedding at sequence position 0 (the CLS token) of the encoder's last
    hidden state feeds two independent linear layers: one producing clarity
    logits and one producing evasion logits.

    Args:
        transformer_model: Encoder module. It must expose ``config.hidden_size``
            and, when called with ``input_ids`` and ``attention_mask``, return
            an object with a ``last_hidden_state`` attribute.
        num_clarity: Number of clarity classes.
        num_evasion: Number of evasion classes.
        clarity_loss_weight: Multiplier applied to the clarity loss.
        evasion_loss_weight: Multiplier applied to the evasion loss.
    """

    def __init__(self, transformer_model, num_clarity=3, num_evasion=9,
                 clarity_loss_weight=1.0, evasion_loss_weight=1.0):
        super().__init__()
        self.transformer = transformer_model
        self.clarity_loss_weight = clarity_loss_weight
        self.evasion_loss_weight = evasion_loss_weight

        # One linear classification head per task, both fed by the same embedding.
        hidden_size = self.transformer.config.hidden_size
        self.clarity_head = nn.Linear(hidden_size, num_clarity)
        self.evasion_head = nn.Linear(hidden_size, num_evasion)

    def forward(self, input_ids, attention_mask, labels_clarity=None, labels_evasion=None, **kwargs):
        """Run the encoder and both heads, optionally computing the joint loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels_clarity: Optional clarity class ids.
            labels_evasion: Optional evasion class ids; entries equal to ``-1``
                are placeholders and are excluded from the evasion loss.
            **kwargs: Extra batch entries (e.g. ``labels_evasion_annotators``);
                accepted and ignored.

        Returns:
            ``(clarity_logits, evasion_logits)`` when either label tensor is
            missing; otherwise ``(loss, (clarity_logits, evasion_logits))`` with
            the loss as the first element.
        """
        encoder_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = encoder_output.last_hidden_state[:, 0, :]  # CLS token

        clarity_logits = self.clarity_head(cls_embedding)
        evasion_logits = self.evasion_head(cls_embedding)
        logits = (clarity_logits, evasion_logits)

        # A loss is only produced when BOTH label tensors are supplied.
        if labels_clarity is None or labels_evasion is None:
            return logits

        loss = self.clarity_loss_weight * F.cross_entropy(clarity_logits, labels_clarity)

        # Rows labelled -1 carry no evasion target; select the real ones explicitly.
        # The `.any()` guard matters: cross-entropy over an empty selection is not
        # a usable loss, so an all-placeholder batch contributes clarity loss only.
        has_evasion_label = labels_evasion != -1
        if has_evasion_label.any():
            loss_evasion = F.cross_entropy(
                evasion_logits[has_evasion_label],
                labels_evasion[has_evasion_label],
            )
            loss = loss + self.evasion_loss_weight * loss_evasion

        return (loss, logits)

In [4]:
# Download the QEvasion dataset from the Hugging Face Hub.
# NOTE(review): despite its name, `df` is the object returned by `load_dataset`
# (indexed by split name, e.g. df["train"]), not a pandas DataFrame.
print("Loading dataset...")
df = load_dataset("ailsntua/QEvasion")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/259k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3448 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/308 [00:00<?, ? examples/s]

In [17]:
# Single source of truth for the checkpoint so the tokenizer and the encoder
# can never drift apart.
PRETRAINED_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
transformer_model = AutoModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Wrap the encoder with the two task heads: 3 clarity classes, 9 evasion classes.
model = MultiTaskModel(transformer_model, num_clarity=3, num_evasion=9)

In [6]:
# List the columns of the train split to see which text and label fields exist.
print(df["train"].column_names)

['title', 'date', 'president', 'url', 'question_order', 'interview_question', 'interview_answer', 'gpt3.5_summary', 'gpt3.5_prediction', 'question', 'annotator_id', 'annotator1', 'annotator2', 'annotator3', 'inaudible', 'multiple_questions', 'affirmative_questions', 'index', 'clarity_label', 'evasion_label']


In [7]:
# Label-name -> integer-id lookups. Each id is the position of the name in its
# list, so list order defines the class index.
clarity2id = {
    label: index
    for index, label in enumerate([
        "Clear Reply",
        "Ambivalent",
        "Clear Non-Reply",
    ])
}

evasion2id = {
    label: index
    for index, label in enumerate([
        "Claims ignorance",
        "Clarification",
        "Declining to answer",
        "Deflection",
        "Dodging",
        "Explicit",
        "General",
        "Implicit",
        "Partial/half-answer",
    ])
}



In [8]:
def _encode_question_answer(example, max_length=256):
    """Tokenize one example as ``<question> | Answer: <answer>``.

    The text is padded and truncated to exactly ``max_length`` tokens.
    Shared by all ``tokenize_*`` functions so the input format stays identical
    across tasks and splits.
    """
    text = example["interview_question"] + " | Answer: " + example["interview_answer"]
    return tokenizer(
        text,
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )


def tokenize_clarity(example):
    """Tokenize an example and attach its clarity class id under ``labels``."""
    encoded = _encode_question_answer(example)
    encoded["labels"] = torch.tensor(clarity2id[example["clarity_label"]])
    return encoded


def tokenize_evasion_train(example):
    """Tokenize an example and attach its evasion class id under ``labels``.

    Raises ``KeyError`` if ``evasion_label`` is not a key of ``evasion2id``.
    """
    encoded = _encode_question_answer(example)
    encoded["labels"] = torch.tensor(evasion2id[example["evasion_label"]])
    return encoded


def tokenize_evasion_test(example):
    """Tokenize an example without attaching any label.

    Not used for tokenization + labels in the current setup; kept so that any
    other code still referencing it continues to work.
    """
    return _encode_question_answer(example)

def get_evasion_annotator_labels(example):
    """Collect the evasion ids given by up to three annotators.

    Reads ``annotator1``..``annotator3`` from ``example``. Values that are empty
    or not a key of ``evasion2id`` are skipped. Recognised ids are packed at the
    front of a length-3 vector and the remaining slots are filled with ``-1``.

    Returns:
        ``{"labels_evasion_annotators": LongTensor of shape (3,)}``.
    """
    recognised_ids = []
    for annotator_field in ("annotator1", "annotator2", "annotator3"):
        raw_label = example.get(annotator_field)
        if raw_label and raw_label in evasion2id:
            recognised_ids.append(evasion2id[raw_label])

    # Pad on the right so the result always has exactly three entries.
    padded_ids = recognised_ids + [-1] * (3 - len(recognised_ids))
    return {"labels_evasion_annotators": torch.tensor(padded_ids, dtype=torch.long)}

In [9]:
# Build the multi-task train/test datasets. Order matters: columns are added and
# renamed first, and the torch format is applied only at the very end.

# 1. Tokenize
# NOTE(review): the train split is tokenized twice; in the cells visible here the
# second pass (`evasion_tokenized_train`) is only read for its label column.
clarity_tokenized_train = df["train"].map(tokenize_clarity)
clarity_tokenized_test  = df["test"].map(tokenize_clarity)
evasion_tokenized_train = df["train"].map(tokenize_evasion_train)

# Prepare multi-annotator labels for the test set
# (all original columns are dropped, leaving only `labels_evasion_annotators`).
evasion_annotators_test_mapped = df["test"].map(get_evasion_annotator_labels, remove_columns=df["test"].column_names)

# 2. Rename labels -> labels_clarity / labels_evasion
clarity_tokenized_train = clarity_tokenized_train.rename_column("labels", "labels_clarity")
clarity_tokenized_test  = clarity_tokenized_test.rename_column("labels", "labels_clarity")
evasion_tokenized_train = evasion_tokenized_train.rename_column("labels", "labels_evasion")

# 3. Convert the evasion labels to int and merge them into the clarity dataset
evasion_labels_train = [int(x) for x in evasion_tokenized_train["labels_evasion"]]
train_dataset = clarity_tokenized_train.add_column("labels_evasion", evasion_labels_train)

# Test dataset (placeholder -1 for model's labels_evasion, and add annotator labels for compute_metrics)
test_dataset = clarity_tokenized_test.add_column("labels_evasion", [-1]*len(clarity_tokenized_test))
test_dataset = test_dataset.add_column("labels_evasion_annotators", evasion_annotators_test_mapped["labels_evasion_annotators"])

# 4. Only now set the format (expose just the listed columns, as torch tensors)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels_clarity", "labels_evasion"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels_clarity", "labels_evasion", "labels_evasion_annotators"])

Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

Map:   0%|          | 0/3448 [00:00<?, ? examples/s]

Map:   0%|          | 0/308 [00:00<?, ? examples/s]

In [10]:
# Sanity check: inspect the first formatted training example.
print(train_dataset[0])

{'input_ids': tensor([  101,  1053,  1012,  1997,  1996,  7226,  2368,  3447,  1012,  1998,
         5496,  1996,  2142,  2163,  1997,  4820,  2859,  2096,  6183,  2005,
         8041,  7566,  1012,  2129,  2052,  2017,  6869,  2000,  2008,  1029,
         1998,  2079,  2017,  2228,  2343,  8418,  2003,  2108, 18006,  2055,
         2893,  1996,  3276,  2067,  2006,  2650,  2004,  2002,  7221,  2015,
         6207,  1999,  2859,  1029,  1064,  3437,  1024,  2092,  1010,  2298,
         1010,  2034,  1997,  2035,  1010,  1996,  2072,  2572, 18006,  2055,
         2893,  1996,  3276,  2157,  1012,  1998,  2028,  1997,  1996,  2477,
         2008,  2003,  2183,  2006,  2085,  2003,  1010,  2859,  2003,  2927,
         2000,  2689,  2070,  1997,  1996,  3513,  1997,  1996,  2208,  1010,
         1999,  3408,  1997,  3119,  1998,  2060,  3314,  1012,  1998,  2061,
         2028,  1997,  1996,  2477,  2057,  5720,  2055,  1010,  2005,  2742,
         1010,  2003,  2008,  2027,  1005,  2128, 

In [11]:
def multitask_collator(batch):
    """Stack per-example tensors into a single batch dictionary.

    ``input_ids``, ``attention_mask`` and ``labels_clarity`` are always stacked.
    ``labels_evasion`` and ``labels_evasion_annotators`` are stacked only when
    the first example of the batch contains them.

    Args:
        batch: Sequence of dicts mapping field names to equally shaped tensors.

    Returns:
        Dict mapping each included field name to a tensor with a new leading
        batch dimension.
    """
    def _stack(field):
        return torch.stack([example[field] for example in batch])

    collated = {
        field: _stack(field)
        for field in ("input_ids", "attention_mask", "labels_clarity")
    }

    # Optional label fields: the first example decides whether they are present.
    first_example = batch[0]
    for optional_field in ("labels_evasion", "labels_evasion_annotators"):
        if optional_field in first_example:
            collated[optional_field] = _stack(optional_field)

    return collated

In [23]:
def compute_metrics(eval_pred):
    """Compute clarity and evasion metrics from an evaluation pass.

    Args:
        eval_pred: Object whose ``predictions`` unpack to
            ``(clarity_logits, evasion_logits)`` and whose ``label_ids`` unpack
            to ``(clarity labels, placeholder evasion labels, annotator labels)``.
            Annotator labels use ``-1`` for empty slots.

    Returns:
        Dict with ``clarity_acc``, ``clarity_f1``, ``evasion_acc`` and
        ``evasion_f1``. Both evasion metrics are ``0.0`` when no sample has a
        valid annotator label.
    """
    clarity_logits, evasion_logits = eval_pred.predictions

    # The middle entry holds the -1 placeholder labels fed to the model; unused here.
    gold_clarity, _placeholder_evasion, gold_annotators = eval_pred.label_ids

    clarity_preds = np.argmax(clarity_logits, axis=1)
    evasion_preds = np.argmax(evasion_logits, axis=1)

    # --- Clarity metrics ---
    metrics = {
        "clarity_acc": accuracy_score(gold_clarity, clarity_preds),
        "clarity_f1": f1_score(gold_clarity, clarity_preds, average="macro"),
    }

    # --- Evasion metrics (multi-annotator) ---
    # Accuracy: a prediction is a hit if it equals ANY valid annotator label.
    # F1: as a simplification, the first valid annotator label is the reference.
    hits = []
    reference_labels = []
    scored_preds = []
    for sample_idx, pred in enumerate(evasion_preds):
        annotator_row = gold_annotators[sample_idx]
        valid_labels = annotator_row[annotator_row != -1]
        if len(valid_labels) == 0:
            # Sample has no annotation at all; leave it out of both metrics.
            continue
        hits.append(pred in valid_labels)
        reference_labels.append(valid_labels[0])
        scored_preds.append(pred)

    if hits:
        metrics["evasion_acc"] = sum(hits) / len(hits)
        metrics["evasion_f1"] = f1_score(reference_labels, scored_preds, average="macro")
    else:
        metrics["evasion_acc"] = 0.0
        metrics["evasion_f1"] = 0.0

    return metrics

In [24]:
# Training configuration for the multi-task model.
args = TrainingArguments(
    output_dir="qevasion_multitask",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_steps=50,
    save_strategy="epoch",
    # Keep every dataset column: the annotator labels are not a named argument
    # of the model's forward() and must still reach compute_metrics.
    remove_unused_columns=False,
    # Order matters: compute_metrics unpacks label_ids in exactly this order.
    label_names=["labels_clarity", "labels_evasion", "labels_evasion_annotators"],
    # Explicitly disable logging integrations; this replaces the deprecated
    # WANDB_DISABLED environment variable.
    report_to="none",
)

# NOTE(review): evaluation runs on the test split, so the reported
# "Validation Loss" and metrics are test-set numbers.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=multitask_collator,
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
# Run fine-tuning; the training arguments request evaluation and a checkpoint
# once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Clarity Acc,Clarity F1,Evasion Acc,Evasion F1
1,1.847,0.825972,0.571429,0.496779,0.396104,0.249299
2,1.9363,0.810029,0.594156,0.539064,0.383117,0.234168
3,1.6724,0.784225,0.655844,0.565843,0.415584,0.279715


TrainOutput(global_step=648, training_loss=1.8528862234986858, metrics={'train_runtime': 247.5071, 'train_samples_per_second': 41.793, 'train_steps_per_second': 2.618, 'total_flos': 0.0, 'train_loss': 1.8528862234986858, 'epoch': 3.0})