In [1]:
import pandas as pd
import torch as t; t.set_grad_enabled(False)
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset, load_dataset, concatenate_datasets
from personality.constants import DATA_PATH, MODEL_PATH


# class name -> integer id used as the classification target, plus the inverse map
LABEL2ID = {
    "humor": 0,
    "sarcasm": 1,
    "remorse": 2,
}
ID2LABEL = dict(zip(LABEL2ID.values(), LABEL2ID.keys()))

# read the wildchat train/test records and keep, for every row, the content of
# the first entry of its "messages" list (presumably the opening user turn)
path = f"{DATA_PATH}/wildchat"
train = pd.read_json(f"{path}/train.jsonl", orient="records", lines=True)
test = pd.read_json(f"{path}/test.jsonl", orient="records", lines=True)
train_prompts = [conversation[0]["content"] for conversation in train["messages"]]
test_prompts = [conversation[0]["content"] for conversation in test["messages"]]

[2025-05-22 15:57:04,717] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
def eval_dataset(model, tokenizer, test_ds):
    """Run `model` over `test_ds` and print per-class and overall metrics.

    Args:
        model: classifier called as `model(**batch)`; must expose `.device`,
            `.eval()` and return an object with a `.logits` attribute.
        tokenizer: handed to `DataCollatorWithPadding` to pad each batch.
        test_ds: dataset fed to a `DataLoader` in batches of 64; every
            collated batch must contain a "labels" entry.

    Prints, for each class id in `ID2LABEL`, the F1 score and an "Accuracy"
    value that is the fraction of examples whose true label is that class and
    that were predicted correctly (i.e. per-class recall). Then prints the
    unweighted mean of the per-class F1 scores and the accuracy over all
    examples. Returns nothing.
    """
    loader = t.utils.data.DataLoader(
        test_ds,
        batch_size=64,
        collate_fn=DataCollatorWithPadding(tokenizer),
    )

    # forward pass over every batch, keeping logits and gold labels
    model.eval()
    logit_chunks, label_chunks = [], []
    with t.inference_mode():
        for raw_batch in tqdm(loader):
            # tensors must live on the same device as the model
            on_device = {name: tensor.to(model.device) for name, tensor in raw_batch.items()}
            logit_chunks.append(model(**on_device).logits)
            label_chunks.append(on_device["labels"])

    # stitch batches together; the prediction is the highest-scoring class
    preds = t.cat(logit_chunks, dim=0).argmax(dim=-1)
    gold = t.cat(label_chunks, dim=0)

    # average=None makes the metric return one F1 value per entry of `labels`
    class_ids = sorted(ID2LABEL)
    per_class_f1 = evaluate.load("f1").compute(
        predictions=preds.cpu().numpy(),
        references=gold.cpu().numpy(),
        average=None,
        labels=class_ids,
    )["f1"]

    is_hit = preds == gold

    print("\nMetrics per class:")
    for class_id, class_f1 in zip(class_ids, per_class_f1):
        # restrict to the examples whose true label is this class
        in_class = gold == class_id
        n_in_class = in_class.sum()
        if n_in_class > 0:
            class_acc = (is_hit[in_class].sum() / n_in_class).item()
        else:
            class_acc = 0

        print(f"{ID2LABEL[class_id]}:")
        print(f"  F1: {class_f1:.4f}")
        print(f"  Accuracy: {class_acc:.4f}")

    # summary figures over all classes / all examples
    mean_f1 = per_class_f1.mean()
    overall_acc = (is_hit.sum() / len(gold)).item()
    print(f"\nMacro Average F1: {mean_f1:.4f}")
    print(f"Overall Accuracy: {overall_acc:.4f}")


def load_personality(
        label: str,
        method: str = "prompting",
        model_name: str = "llama-3.1-8b-it",
        system_prompt_type: str = "short",
        steering_prompt_type: str = "short",
        split: str="test",
        adversarial: bool=False
) -> Dataset:
    """Load the generated responses for one personality label as a Dataset.

    The file read is
    `{DATA_PATH}/wildchat/{method}/{model_name}/{label}{suffix}.jsonl`, where
    `suffix` is `-{system_prompt_type}` for method "prompting",
    `-{steering_prompt_type}` for method "steering", and empty for any other
    method; `-adversarial` is appended when `adversarial` is truthy.

    Rows are filtered to those whose "split" column equals `split`. The
    existing "prompt" column is then replaced by the module-level
    `train_prompts` (when `split == "train"`) or `test_prompts` (otherwise),
    and a constant "label" column holding `LABEL2ID[label]` is added.

    NOTE(review): the prompt replacement is positional, so it relies on the
    filtered rows matching the prompt list in length and order - confirm
    against how the jsonl files were generated.
    """
    # only these two methods carry a prompt-type suffix in the file name
    prompt_types = {"prompting": system_prompt_type, "steering": steering_prompt_type}
    suffix = f"-{prompt_types[method]}" if method in prompt_types else ""
    suffix = suffix + "-adversarial" if adversarial else suffix

    data_file = f"{DATA_PATH}/wildchat/{method}/{model_name}/{label}{suffix}.jsonl"
    # split="train" here is the argument to load_dataset; it is unrelated to the
    # "split" column that is filtered on the next line
    records = load_dataset("json", data_files=data_file, split="train")
    records = records.filter(lambda row: row["split"] == split)

    # swap the stored prompt column for the prompts list, then attach the class id
    prompts = train_prompts if split == "train" else test_prompts
    return (
        records
        .remove_columns("prompt")
        .add_column("prompt", prompts)
        .add_column("label", [LABEL2ID[label]] * len(records))
    )

In [3]:
# notebook-level settings; the evaluation cells pass `model_name` and
# `system_prompt_type` on to load_personality
label: str = "humor"
method: str = "prompting"
model_name: str = "llama-3.1-8b-it"

# prompt variants that select the data file suffix for the
# "prompting" and "steering" methods respectively
system_prompt_type: str = "short"
steering_prompt_type: str = "short"

split: str = "train"

In [4]:
# classifier checkpoint: one output per entry of LABEL2ID, loaded in bfloat16
model = AutoModelForSequenceClassification.from_pretrained(
    f"{MODEL_PATH}/modernbert-base-classifier-exp",
    num_labels=len(LABEL2ID),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
    problem_type="single_label_classification",
    torch_dtype=t.bfloat16,
    trust_remote_code=True,
)
# move to the GPU and switch to eval mode (both calls return the module itself)
model = model.to("cuda").eval()

# the tokenizer is read from the base model directory, not the classifier one
tokenizer = AutoTokenizer.from_pretrained(f"{MODEL_PATH}/modernbert-base")
def tokenize(element):
    """Tokenize one prompt/response pair for the classifier.

    Args:
        element: mapping with "prompt", "response" and "label" entries.

    Returns:
        The output of the module-level `tokenizer` for the text
        "Human: {prompt}\\n\\nAssistant: {response}" (truncated to at most
        8192 tokens), with the element's "label" stored under "label".
        This is the tokenizer's encoding object, not a string - the previous
        `-> str` annotation was incorrect.
    """
    prompt = element["prompt"]
    completion = element["response"]
    text = f"Human: {prompt}\n\nAssistant: {completion}"
    out = tokenizer(text, truncation=True, max_length=8192)
    # carry the class id through so the collated batch has labels to score against
    out["label"] = element["label"]
    return out

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [5]:
# Evaluate the classifier on the default ("test") split, once per generation
# method. Both runs share the same pipeline: load the three personality
# datasets, concatenate and shuffle with a fixed seed, tokenize, evaluate.
# The loop variables are deliberately not called `method`/`label` so the
# notebook-level settings of those names are left untouched.
for eval_method, eval_desc in (("prompting", "prompted"), ("training", "trained")):
    splits = [
        load_personality(trait, eval_method, model_name, system_prompt_type)
        for trait in ("humor", "sarcasm", "remorse")
    ]
    ds = concatenate_datasets(splits).shuffle(seed=123456)
    # drop every raw column except the target; tokenize() adds the model inputs
    cols = [c for c in ds.column_names if c != "label"]
    test_ds = ds.map(tokenize, remove_columns=cols)
    print(f"test accuracy for {eval_desc} models")
    eval_dataset(model, tokenizer, test_ds)

test accuracy for prompted models


100%|██████████| 47/47 [00:08<00:00,  5.30it/s]



Metrics per class:
humor:
  F1: 0.6593
  Accuracy: 0.6860
sarcasm:
  F1: 0.7098
  Accuracy: 0.7130
remorse:
  F1: 0.7152
  Accuracy: 0.6830

Macro Average F1: 0.6948
Overall Accuracy: 0.6940


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

test accuracy for trained models


100%|██████████| 47/47 [00:06<00:00,  7.54it/s]



Metrics per class:
humor:
  F1: 0.6702
  Accuracy: 0.6940
sarcasm:
  F1: 0.6208
  Accuracy: 0.5860
remorse:
  F1: 0.7594
  Accuracy: 0.7750

Macro Average F1: 0.6835
Overall Accuracy: 0.6850


In [6]:
# Evaluate the classifier on the adversarial variants of the default ("test")
# split, once per generation method. Both runs share the same pipeline: load
# the three personality datasets with adversarial=True, concatenate and
# shuffle with a fixed seed, tokenize, evaluate.
# The loop variables are deliberately not called `method`/`label` so the
# notebook-level settings of those names are left untouched.
for eval_method, eval_desc in (("prompting", "prompted"), ("training", "trained")):
    splits = [
        load_personality(trait, eval_method, model_name, system_prompt_type, adversarial=True)
        for trait in ("humor", "sarcasm", "remorse")
    ]
    ds = concatenate_datasets(splits).shuffle(seed=123456)
    # drop every raw column except the target; tokenize() adds the model inputs
    cols = [c for c in ds.column_names if c != "label"]
    test_ds = ds.map(tokenize, remove_columns=cols)
    print(f"adversarial test accuracy for {eval_desc} models")
    eval_dataset(model, tokenizer, test_ds)

adversarial test accuracy for prompted models


100%|██████████| 47/47 [00:06<00:00,  6.96it/s]



Metrics per class:
humor:
  F1: 0.5096
  Accuracy: 0.6470
sarcasm:
  F1: 0.3744
  Accuracy: 0.3370
remorse:
  F1: 0.4238
  Accuracy: 0.3520

Macro Average F1: 0.4360
Overall Accuracy: 0.4453
adversarial test accuracy for trained models


100%|██████████| 47/47 [00:05<00:00,  8.38it/s]



Metrics per class:
humor:
  F1: 0.6179
  Accuracy: 0.6510
sarcasm:
  F1: 0.6075
  Accuracy: 0.6370
remorse:
  F1: 0.6715
  Accuracy: 0.6030

Macro Average F1: 0.6323
Overall Accuracy: 0.6303
