# **Installing the necessary libraries**

In [1]:
pip -q install scikit-learn torch pandas datasets transformers torchvision accelerate

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from datasets import Dataset, load_dataset
import os
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


False
0


AssertionError: Torch not compiled with CUDA enabled

# **Loading the data**

In [3]:
# Download (or reuse the cached copy of) the QEvasion dataset from the Hugging Face Hub.
# NOTE: despite its name, `df` is not a pandas DataFrame; later cells index it by split
# name ("train" / "test").
print("Loading dataset...")
df = load_dataset(path="ailsntua/QEvasion")

def clarity_to_label(row):
    """Attach integer targets derived from the row's ``clarity_label`` string.

    Adds two keys to ``row`` (in place) and returns the same object:

    * ``label``        -- 0 = "Clear Reply", 1 = "Ambivalent", 2 = "Clear Non-Reply"
    * ``binary_label`` -- 1 for "Clear Non-Reply", 0 for the other two classes

    Raises:
        KeyError: if ``clarity_label`` is not one of the three known strings.
    """
    class_ids = {"Clear Reply": 0, "Ambivalent": 1, "Clear Non-Reply": 2}
    clarity = row["clarity_label"]
    # Look the class up first so an unknown label fails before the row is touched.
    class_id = class_ids[clarity]
    row["label"] = class_id
    if clarity == "Clear Non-Reply":
        row["binary_label"] = 1
    else:
        row["binary_label"] = 0
    return row

# Add the integer `label` / `binary_label` columns to every split.
df = df.map(function=clarity_to_label)

# Gold 3-class labels of the test split, kept for the final evaluation.
y_test = df["test"]["label"]




Loading dataset...


# **Defining the evaluation**

In [4]:
def evaluate(y_true, y_pred):
    """Print a classification report, show a confusion matrix and return macro scores.

    Args:
        y_true: ground-truth class labels.
        y_pred: predicted class labels, aligned with ``y_true``.

    Returns:
        Tuple ``(precision, recall, f1)`` of macro-averaged scores
        (classes with no predictions count as 0 instead of warning).
    """
    # Imported locally: the notebook's import cell does not bring this name into
    # scope, so without it the call below raises NameError.
    from sklearn.metrics import precision_recall_fscore_support

    print(classification_report(y_true, y_pred))
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0
    )
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
    plt.show()
    return precision, recall, f1

def compute_metrics_binary(eval_pred):
    """Metric callback for the binary stage.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the trainer.

    Returns:
        Dict with ``precision``, ``recall`` and ``f1`` of the positive class
        (``average='binary'``); undefined scores are reported as 0.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    shared = dict(average='binary', zero_division=0)
    return {
        "precision": precision_score(gold, predicted, **shared),
        "recall": recall_score(gold, predicted, **shared),
        "f1": f1_score(gold, predicted, **shared),
    }

def compute_metrics_fine(eval_pred):
    """Metric callback for the fine-grained stage.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by the trainer.

    Returns:
        Dict with macro-averaged ``precision``, ``recall`` and ``f1``;
        undefined scores are reported as 0.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    # Same call for each metric; only the scoring function changes.
    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {
        name: scorer(gold, predicted, average='macro', zero_division=0)
        for name, scorer in scorers
    }

# **Loading the model and tokenizer**

In [5]:


# Two independent 2-way classification heads on the same pretrained encoder:
#   binary_model -- "Clear Non-Reply" vs. everything else
#   fine_model   -- "Clear Reply" vs. "Ambivalent"
binary_model, fine_model = (
    AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    for _ in range(2)
)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize(example):
    """Tokenize one question/answer pair and carry both targets along.

    The question and answer are joined into a single
    ``"Question: ... Answer: ..."`` string, truncated/padded to 512 tokens.

    Args:
        example: row with ``interview_question``, ``interview_answer``,
            ``label`` and ``binary_label``.

    Returns:
        The tokenizer output with ``label`` and ``binary_label`` copied in.
    """
    prompt = "Question: " + example["interview_question"]
    prompt = prompt + " Answer: " + example["interview_answer"]
    encoded = tokenizer(prompt, padding="max_length", max_length=512, truncation=True)
    for target in ("label", "binary_label"):
        encoded[target] = example[target]
    return encoded

# Tokenize each split, then drop the raw-text columns that tokenization has consumed.
tokenized_train = df["train"].map(tokenize).remove_columns(
    ["interview_question", "interview_answer", "clarity_label"]
)
tokenized_test = df["test"].map(tokenize).remove_columns(
    ["interview_question", "interview_answer", "clarity_label"]
)

# Return torch tensors when rows are accessed (in-place format switch).
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Preparing the data for training**

In [6]:

# Stage 1 (binary): drop the 3-class target and expose `binary_label` as `labels`.
binary_train, binary_test = (
    split.remove_columns(["label"]).rename_column("binary_label", "labels")
    for split in (tokenized_train, tokenized_test)
)

# Stage 2 (fine-grained): keep only rows whose binary_label is 0, i.e. everything
# that is not a "Clear Non-Reply", and expose the 3-class `label` as `labels`.
train_fine, test_fine = (
    split.filter(lambda x: x["binary_label"] == 0).rename_column("label", "labels")
    for split in (tokenized_train, tokenized_test)
)


# "Balanced" per-class weights over the three clarity classes of the training split.
labels_np = np.array(df["train"]["label"])
class_weights_np = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(labels_np),
    y=labels_np,
)
class_weights = torch.tensor(class_weights_np, dtype=torch.float)
print("Class weights (3-class):", class_weights)



Class weights (3-class): tensor([1.0925, 0.5634, 3.2285])


# **Defining the weighted trainer and training both stages**

In [7]:
# Disable logging to wandb
os.environ.update(WANDB_DISABLED="true")

class WeightedTrainer(Trainer):
    """Trainer whose loss is an (optionally class-weighted) cross-entropy on the logits.

    The labels are removed from the batch before the forward pass, so the loss
    is always the one computed here rather than one computed by the model.

    Args:
        weights: optional 1-D tensor with one weight per class; ``None`` gives
            an unweighted cross-entropy.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.

    Note:
        ``weights`` is the first positional parameter, so the regular Trainer
        arguments (``model=...``, ``args=...``) should be passed by keyword.
    """

    def __init__(self, weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.weights = weights.to(self.model.device) if weights is not None else None

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy for one batch.

        Args:
            model: the model being trained or evaluated.
            inputs: batch dict; its ``"labels"`` entry is popped (the dict is mutated).
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            **kwargs: extra arguments passed by the Trainer; ignored.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = self.weights
        if weight is not None and weight.device != logits.device:
            # The device captured in __init__ can be stale if the model was moved
            # afterwards; follow the logits and remember the result.
            weight = weight.to(logits.device)
            self.weights = weight

        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss
    
# Stage 1: binary classifier ("Clear Non-Reply" vs. everything else).
# NOTE(review): the test split is also the eval set, so with load_best_model_at_end
# the checkpoint is selected on test data — consider a held-out validation split.
trainer_binary = WeightedTrainer(
    weights=torch.tensor([1.0, 1.0]),  # binary classes weighted equally
    model=binary_model,
    train_dataset=binary_train,
    eval_dataset=binary_test,
    compute_metrics=compute_metrics_binary,
    args=TrainingArguments(
        output_dir="binary_output",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # fp16 mixed precision needs a CUDA device; fall back to full precision
        # elsewhere so the cell also runs on CPU-only machines.
        fp16=torch.cuda.is_available(),
        report_to="none"
    ),
)
trainer_binary.train()

# Stage 2: "Clear Reply" vs. "Ambivalent", trained on rows that are not "Clear Non-Reply".
# The weights reuse the first two entries of the 3-class balanced weights.
# NOTE(review): WeightedTrainer uses a mean-reduced weighted cross-entropy, which is
# normalised by the summed weights, so the constant factor 2 should not change the
# optimisation — confirm before tuning it.
fine_class_weights = class_weights[0:2] * 2  # example weighting: Clear Reply + Ambivalent

# NOTE(review): the test split is also the eval set, so with load_best_model_at_end
# the checkpoint is selected on test data — consider a held-out validation split.
trainer_fine = WeightedTrainer(
    weights=fine_class_weights,
    model=fine_model,
    train_dataset=train_fine,
    eval_dataset=test_fine,
    compute_metrics=compute_metrics_fine,
    args=TrainingArguments(
        output_dir="fine_output",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # fp16 mixed precision needs a CUDA device; fall back to full precision
        # elsewhere so the cell also runs on CPU-only machines.
        fp16=torch.cuda.is_available(),
        report_to="none"
    ),
)
trainer_fine.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

def _predict_class(model, example):
    """Return the arg-max class index ``model`` assigns to a single tokenized example."""
    import inspect

    # Forward only what the model's forward() accepts. "labels" is excluded on
    # purpose: a 3-class label would be out of range for a 2-class head's loss.
    accepted = set(inspect.signature(model.forward).parameters) - {"labels"}
    device = next(model.parameters()).device

    batch = {}
    for key, value in example.items():
        if key not in accepted:
            continue
        tensor = torch.as_tensor(value)
        if tensor.dim() == 1:
            # A single dataset row has no batch dimension yet.
            tensor = tensor.unsqueeze(0)
        batch[key] = tensor.to(device)

    was_training = model.training
    model.eval()  # disable dropout so the prediction is deterministic
    try:
        with torch.no_grad():
            logits = model(**batch).logits
    finally:
        model.train(was_training)
    return logits.argmax(dim=-1).item()


def classify_two_stage(example):
    """Predict the 3-class clarity label of one example with the two-stage pipeline.

    Stage 1 (``binary_model``) decides whether the answer is a "Clear Non-Reply";
    if it is not, stage 2 (``fine_model``) chooses between "Clear Reply" and
    "Ambivalent".

    Args:
        example: mapping of tokenizer outputs for a single example (1-D tensors,
            or tensors that already carry a batch dimension of size 1). Keys the
            models do not accept, and ``labels``, are ignored.

    Returns:
        int: 0 = Clear Reply, 1 = Ambivalent, 2 = Clear Non-Reply.
    """
    if _predict_class(binary_model, example) == 1:
        return 2  # Clear Non-Reply

    return _predict_class(fine_model, example)

def plot_cm(trainer, dataset, display_labels):
    """Run ``trainer`` on ``dataset`` and draw the resulting confusion matrix.

    Args:
        trainer: object whose ``predict(dataset)`` returns ``predictions``
            (per-class scores) and ``label_ids`` (gold labels).
        dataset: dataset to predict on.
        display_labels: tick labels for the matrix, in class-index order.
    """
    result = trainer.predict(dataset)
    predicted = np.argmax(result.predictions, axis=-1)
    ConfusionMatrixDisplay.from_predictions(
        result.label_ids,
        predicted,
        display_labels=display_labels,
    )
    plt.show()

# Confusion matrix of each stage on its own test set.
plot_cm(
    trainer=trainer_binary,
    dataset=binary_test,
    display_labels=["Clear Reply / Ambivalent", "Clear Non-Reply"],
)
plot_cm(
    trainer=trainer_fine,
    dataset=test_fine,
    display_labels=["Clear Reply", "Ambivalent"],
)