In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import os, math, json
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import torch

In [46]:
# Tab-separated file without a header row: column 0 holds the sentence and
# column 1 the comma-separated integer label ids.
PATH = "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/AnnotatedData/en-annotated.tsv"  # change to your file
# keep_default_na=False: by default pandas turns fields whose entire content is
# an NA token such as "NA", "null" or "nan" into NaN. For free text that would
# silently replace a real sentence with a float, which the tokenizer cannot
# consume later on. Empty label fields stay "" and still parse to "no labels".
df = pd.read_csv(PATH, sep="\t", header=None, keep_default_na=False)
df = df.rename(columns={0: "text", 1: "label_str"})
print(df.head())
assert {"text","label_str"}.issubset(df.columns), "Expect columns: text,label_str"

# Build the label space from the file (supports labels like 0,1,2 or 1,3,7, etc.)
def parse_labels(s):
    """Turn a raw label cell into a list of integer label ids.

    The cell is expected to look like ``"1, 4, 7"`` (or be a bare number).
    Missing values and blank strings yield an empty list; empty pieces between
    commas are ignored.
    """
    if pd.isna(s):
        return []
    pieces = (piece.strip() for piece in str(s).split(","))
    return [int(piece) for piece in pieces if piece]

# Collect every label id that occurs anywhere in the file.
all_labels = set().union(*(parse_labels(s) for s in df["label_str"]))
# Sorting gives a stable order; position in idx2lbl is the model output column.
idx2lbl = sorted(all_labels)                 # e.g. [0,1,2,3]
lbl2col = dict(zip(idx2lbl, range(len(idx2lbl))))
num_labels = len(idx2lbl)

def to_multi_hot(s):
    """Encode a raw label cell as a float32 indicator vector of length num_labels.

    Position ``lbl2col[lbl]`` is set to 1.0 for every label parsed from ``s``;
    labels that are not in ``lbl2col`` are skipped.
    """
    known_cols = [lbl2col[lbl] for lbl in parse_labels(s) if lbl in lbl2col]
    vec = np.zeros(num_labels, dtype=np.float32)
    if known_cols:
        vec[known_cols] = 1.0
    return vec

df["labels"] = df["label_str"].apply(to_multi_hot)

# 80% of the rows go to training; the remaining rows are halved into
# validation and test. Fixed random_state keeps the split reproducible.
train_df = df.sample(frac=0.8, random_state=42)
tmp_df = df.loc[~df.index.isin(train_df.index)]
valid_df = tmp_df.sample(frac=0.5, random_state=42)
test_df  = tmp_df.loc[~tmp_df.index.isin(valid_df.index)]

# Only the model inputs are carried over; the pandas index is discarded.
split_frames = {"train": train_df, "validation": valid_df, "test": test_df}
ds = DatasetDict({
    split: Dataset.from_pandas(frame[["text", "labels"]], preserve_index=False)
    for split, frame in split_frames.items()
})


                                                text label_str
0                                              , ...         1
1                                                  !   1, 4, 7
2  ... And I don't think we need to discuss the T...      8, 1
3                        * So get up out of your bed         1
4  A confession that you hired [PERSON] ... and a...      1, 6


In [47]:
# Checkpoint used for both the tokenizer and the classifier;
# pick any encoder model you like.
MODEL = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tok(batch):
    """Tokenize the "text" column of a batch, truncating over-long inputs."""
    texts = batch["text"]
    return tokenizer(texts, truncation=True)

# Tokenize every split in batches; the raw text column is dropped afterwards.
ds = ds.map(tok, batched=True, remove_columns=["text"])
# Padding is left to the collator, which is applied when batches are assembled.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/14022 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

In [51]:


# Output column -> label name (the raw label id as a string), and the inverse.
id2label = {col: str(lbl) for col, lbl in enumerate(idx2lbl)}
label2id = {name: col for col, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=num_labels,
    problem_type="multi_label_classification",  # ensures BCEWithLogitsLoss
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import numpy as np

THRESH = 0.5  # or tune on val set


def sigmoid_stable(x):
    """Element-wise logistic sigmoid that only ever exponentiates non-positive values.

    ``1 / (1 + exp(-x))`` overflows inside ``exp`` for large negative ``x`` and
    emits a RuntimeWarning. Using ``exp(-|x|)`` keeps the intermediate in
    (0, 1] for every input while producing the same function.
    """
    x = np.asarray(x)
    e = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))


def compute_metrics(eval_pred):
    """Multi-label metrics for the Trainer evaluation loop.

    Parameters
    ----------
    eval_pred : pair ``(logits, labels)``
        Raw model outputs and the multi-hot targets. The targets arrive as
        floats (that is what the loss needs) and are cast to int here.

    Returns
    -------
    dict
        Micro/macro F1, precision and recall at ``THRESH``, plus
        ``roc_auc_macro``. The AUC entry is NaN when scikit-learn cannot
        compute it for this evaluation set.
    """
    logits, y_true = eval_pred
    # Some models hand back a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    probs = sigmoid_stable(logits)
    y_pred = (probs >= THRESH).astype(int)
    y_true = np.asarray(y_true).astype(int)          # <- cast for metrics

    metrics = {
        "f1_micro":       f1_score(y_true, y_pred, average="micro", zero_division=0),
        "f1_macro":       f1_score(y_true, y_pred, average="macro", zero_division=0),
        "precision_micro":precision_score(y_true, y_pred, average="micro", zero_division=0),
        "recall_micro":   recall_score(y_true, y_pred, average="micro", zero_division=0),
        "precision_macro":precision_score(y_true, y_pred, average="macro", zero_division=0),
        "recall_macro":   recall_score(y_true, y_pred, average="macro", zero_division=0),
    }
    # AUC needs probabilities (and at least one positive per class to work).
    try:
        metrics["roc_auc_macro"] = roc_auc_score(y_true, probs, average="macro")
    except ValueError:
        # Keep the key present so every evaluation reports the same columns;
        # NaN marks "undefined for this set" instead of silently dropping it.
        metrics["roc_auc_macro"] = float("nan")
    return metrics


In [54]:
# All tunable training settings in one place.
training_config = dict(
    output_dir="mlc-run",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.05,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the highest validation micro-F1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    greater_is_better=True,
    # logging
    logging_steps=50,
    report_to="none",
    # mixed precision is currently left off:
    #fp16=torch.cuda.is_available(),
)
args = TrainingArguments(**training_config)

def to_torch(batch):
    """Replace batch["labels"] with a torch tensor built from it; returns the same mapping."""
    labels_tensor = torch.tensor(batch["labels"])
    batch["labels"] = labels_tensor
    return batch

# Run to_torch over every example of every split.
ds = ds.map(to_torch)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it triggers a
    # FutureWarning); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Map:   0%|          | 0/14022 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

Map:   0%|          | 0/1753 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro,Precision Macro,Recall Macro,Roc Auc Macro
1,0.3418,0.335291,0.373154,0.341623,0.661731,0.259839,0.676889,0.247971,0.825339
2,0.303,0.325143,0.461407,0.453675,0.639461,0.360912,0.635957,0.359078,0.834745
3,0.2725,0.328854,0.485447,0.482016,0.602386,0.40653,0.599958,0.405942,0.835529
4,0.2385,0.333854,0.495929,0.49275,0.600891,0.422182,0.599542,0.41973,0.833646


TrainOutput(global_step=3508, training_loss=0.3019787242706173, metrics={'train_runtime': 315.1035, 'train_samples_per_second': 177.999, 'train_steps_per_second': 11.133, 'total_flos': 374679623317440.0, 'train_loss': 0.3019787242706173, 'epoch': 4.0})

In [19]:
# Emotion names looked up by the raw integer label value from the data file
# (used as ID2NAME[int(label)] in the prediction cell), not by model output column.
# NOTE(review): the id -> emotion order is taken as given here; confirm it
# against the dataset's own documentation.
ID2NAME = ["neutral", "anger", "anticipation", "disgust", "fear", "joy", "sadness", "surprise", "trust"]

In [55]:
# Score the held-out test split with the same metrics used during training.
test_metrics = trainer.evaluate(ds["test"])
print(test_metrics)

# ----------------------------
# 7) Inference helper
# ----------------------------
def predict(sentences, threshold=0.5):
    """Predict label sets for raw sentences with the fine-tuned model.

    Parameters
    ----------
    sentences : str or sequence of str
        Text to classify.
    threshold : float, default 0.5
        A label is emitted when its sigmoid probability is >= this value.

    Returns
    -------
    decoded : list of lists
        For each sentence, the entries of ``idx2lbl`` (raw label ids) whose
        probability reached the threshold.
    probs : numpy.ndarray
        One row per sentence, one column per entry of ``idx2lbl``.
    """
    model.eval()
    # Run on whatever device the model already lives on.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # No parameters to inspect: pick a device and place the model there.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

    # Nothing to classify: return empty results with the usual column count
    # rather than handing the tokenizer an empty batch.
    if not isinstance(sentences, str) and len(sentences) == 0:
        return [], np.zeros((0, len(idx2lbl)), dtype=np.float32)

    enc = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}  # inputs must share the model's device

    with torch.inference_mode():
        logits = model(**enc).logits
        probs = torch.sigmoid(logits).detach().cpu().numpy()

    # Column i of probs corresponds to idx2lbl[i].
    decoded = [
        [idx2lbl[col] for col, p in enumerate(row) if p >= threshold]
        for row in probs
    ]
    return decoded, probs

# A few hand-written sentences for a quick sanity check of the model.
examples = [
    "Very nice!",
    "Why would you do that!?",
    "Hello.",
    "You shall not pass!",
    "WHY?!!?",
]
pred_labels, pred_probs = predict(examples, threshold=0.5)
# Swap each raw label id for its emotion name.
named_labels = []
for pred in pred_labels:
    named_labels.append([ID2NAME[int(label)] for label in pred])
pred_labels = named_labels
print(list(zip(examples, pred_labels)))

{'eval_loss': 0.33711275458335876, 'eval_f1_micro': 0.5002602811035919, 'eval_f1_macro': 0.4941339195003052, 'eval_precision_micro': 0.5987538940809969, 'eval_recall_micro': 0.4295932051855163, 'eval_precision_macro': 0.5955045721730345, 'eval_recall_macro': 0.4253088598625193, 'eval_roc_auc_macro': 0.8299548683771205, 'eval_runtime': 1.5827, 'eval_samples_per_second': 1107.614, 'eval_steps_per_second': 34.751, 'epoch': 4.0}
[('Very nice!', ['joy']), ('Why would you do that!?', ['anger', 'surprise']), ('Hello.', ['anticipation']), ('You shall not pass!', ['anger']), ('WHY?!!?', ['surprise'])]
