In [None]:
!pip install evaluate
import os, math, json
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import torch



In [None]:
!pip install -U transformers huggingface_hub

Collecting huggingface_hub
  Using cached huggingface_hub-1.1.2-py3-none-any.whl.metadata (13 kB)


Select which languages to use.

In [None]:
# Language code -> raw GitHub URL of the tab-separated XED file loaded for that language.
# "en"/"fi" point at the AnnotatedData folder, "nl"/"fr" at the Projections folder.
# Only the uncommented entries are loaded; currently that is Dutch ("nl") alone.
SRC = {
    #"en": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/AnnotatedData/en-annotated.tsv",
    #"fi": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/AnnotatedData/fi-annotated.tsv",
    "nl": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/Projections/nl-projections.tsv",
    #"fr": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/Projections/fr-projections.tsv",

}

# Numeric emotion id used in the TSV label column -> emotion name.
# Ids are assigned in order, starting at 1.
emotions_mapping = dict(
    enumerate(
        (
            "anger",
            "anticipation",
            "disgust",
            "fear",
            "joy",
            "sadness",
            "surprise",
            "trust",
        ),
        start=1,
    )
)

Load the dataset(s) from repo

In [None]:
def load_lang(url, lang):
    """Read one headerless, tab-separated file into a DataFrame.

    Parameters
    ----------
    url : str or file-like
        Location handed straight to ``pd.read_csv``.
    lang : str
        Language code written into a constant ``lang`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``text``, ``labels`` and ``lang``; ``text`` and ``labels``
        are read as plain strings.
    """
    df = pd.read_csv(
        url,
        sep="\t",
        header=None,
        names=["text", "labels"],
        # Read everything as text: a column holding only lone ids plus a gap
        # would otherwise become floats ("4.0"), which the label parser rejects.
        dtype=str,
        # A sentence that literally reads "NA", "null", "nan", ... is data,
        # not a missing value.
        keep_default_na=False,
    )
    df["lang"] = lang
    return df

# One frame per configured language, stacked under a fresh 0..n-1 index.
dfs = [load_lang(url=address, lang=code) for code, address in SRC.items()]
df = pd.concat(dfs, ignore_index=True)

# Coerce both raw columns to stripped strings; a missing label cell becomes "".
df = df.assign(
    text=df["text"].astype(str).str.strip(),
    labels=df["labels"].fillna("").astype(str).str.strip(),
)

# Parse "1, 6" -> [1,6]
def parse_ids(s):
    """Turn a comma-separated id string into a list of ints.

    Pieces that are not plain decimal numbers (empty pieces, words, "4.0", ...)
    are skipped. A falsy input yields an empty list.

    Parameters
    ----------
    s : str
        Raw label cell, e.g. ``"1, 6"``.

    Returns
    -------
    list[int]
        The ids in their original order, e.g. ``[1, 6]``.
    """
    if not s:
        return []
    pieces = (piece.strip() for piece in s.split(","))
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as "²" that int() cannot convert, which would raise ValueError.
    return [int(piece) for piece in pieces if piece.isdecimal()]

# Element-wise parse of every raw label cell into a list of integer ids.
df["label_ids"] = df["labels"].map(parse_ids)

Some data preparation, including normalization and cleaning of the text.

In [None]:
import re
import unicodedata

# Matches http(s):// links and bare www. links, case-insensitively.
URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
# Matches simple e-mail addresses of the form local@domain.tld.
EMAIL_RE = re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b')

def basic_clean(text: str):
    """Normalize one piece of text and mask URLs and e-mail addresses.

    Missing values (as judged by ``pd.isna``) become the empty string.
    Anything else is converted to ``str``, NFKC-normalized, and has every URL
    replaced by `` <URL> `` and every e-mail address by `` <EMAIL> ``.
    URLs are masked before e-mail addresses. Whitespace is not collapsed.
    """
    if pd.isna(text):
        return ""

    normalized = unicodedata.normalize("NFKC", str(text))
    without_urls = URL_RE.sub(" <URL> ", normalized)
    return EMAIL_RE.sub(" <EMAIL> ", without_urls)

# Run the cleaner over every sentence; the function is passed directly, no wrapper needed.
df["text"] = df["text"].astype(str).map(basic_clean)

Drop duplicate rows

In [None]:
def drop_exact_dupes(df):
    """Return ``df`` without rows that repeat an earlier (text, labels, lang) triple.

    The first occurrence of each triple is kept. The input frame is not
    modified; a new frame is returned with the surviving rows' original index.
    """
    identity_columns = ["text", "labels", "lang"]
    return df.drop_duplicates(subset=identity_columns)

# drop_duplicates returns a NEW frame, so the result must be bound back to `df`;
# otherwise the duplicates stay in the data used by every later cell.
# The index is rebuilt so it is contiguous again after rows were removed.
df = drop_exact_dupes(df).reset_index(drop=True)
df

Unnamed: 0,text,labels,lang,label_ids
0,"VS-SCHIP FORRESTAL, OOSTELIJK MIDDELLANDSE-ZEE...",4,nl,[4]
1,Mayday !,4,nl,[4]
2,Dit is de VS-marinehelikopter.,"4, 8",nl,"[4, 8]"
3,Nightingale 501.,"2, 5, 8",nl,"[2, 5, 8]"
4,We nemen de positie in om een duiker op uw dek...,8,nl,[8]
...,...,...,...,...
5329,Borden bleef je privacy schenden.,1,nl,[1]
5330,"Ik wil feesten, kun je me helpen?",2,nl,[2]
5331,Hoe kwam dat geld op naam van Jane?,7,nl,[7]
5332,Dat zoeken we uit. Hopelijk leidt het ons naar...,8,nl,[8]


Shift the label ids (1–8) to zero-based indices, then encode each row's labels as a multi-hot vector.

In [None]:
# Emotion ids in ascending order; their position is the zero-based class index.
all_ids = sorted(emotions_mapping)  # 1..8
num_labels = len(all_ids)

id2idx = dict(zip(all_ids, range(num_labels)))  # 1->0, 2->1, ...
idx2id = dict(enumerate(all_ids))               # 0->1, 1->2, ...
idx2name = {idx: emotions_mapping[lab_id] for idx, lab_id in idx2id.items()}

def to_multihot(ids):
    """Encode a list of emotion ids as a multi-hot list of floats.

    The result has ``num_labels`` entries; the entry at ``id2idx[lab_id]`` is
    1.0 for every id present in ``ids`` and 0.0 elsewhere. Ids that are not
    keys of ``id2idx`` are ignored.
    """
    hot_positions = [id2idx[lab_id] for lab_id in ids if lab_id in id2idx]
    vec = np.zeros(num_labels, dtype=np.float32)
    vec[hot_positions] = 1.0
    return vec.tolist()

df["multihot"] = df["label_ids"].map(to_multihot)

# Convert to HF dataset; the multi-hot column is exposed under the name "labels".
dataset = (
    Dataset.from_pandas(df[["text", "multihot", "lang"]], preserve_index=False)
    .rename_columns({"multihot": "labels"})
)

Tokenize the data to measure each sentence's length in tokens. We use the 95th percentile of those lengths as the maximum sequence length.

In [None]:
MODEL = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Number of token ids produced for each sentence, with truncation switched off.
df["token_length"] = [
    len(tokenizer.encode(sentence, truncation=False)) for sentence in df["text"]
]

print("Max token length:", df["token_length"].max())
print("Average token length:", df["token_length"].mean())
print(df["token_length"].describe())

# Sequence budget: the 95th percentile of the observed lengths, truncated to an int.
max_length = int(np.percentile(df["token_length"], 95))
print("Max length (95% percentile):", max_length)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Max token length: 62
Average token length: 11.61304836895388
count    5334.000000
mean       11.613048
std         5.505760
min         3.000000
25%         8.000000
50%        10.000000
75%        14.000000
max        62.000000
Name: token_length, dtype: float64
Max length (95% percentile): 22


70/15/15 train/validation/test split.

In [None]:
SEED = 123

# First cut: 70% train, 30% held out.
ds_tv = dataset.train_test_split(test_size=0.3, seed=SEED)

# Second cut, computed once: the held-out 30% is halved into validation and test.
# Taking both halves from the same split object guarantees they are disjoint,
# instead of relying on two separate calls reproducing the same shuffle.
ds_holdout = ds_tv["test"].train_test_split(test_size=0.5, seed=SEED)

ds = DatasetDict(
    train=ds_tv["train"],
    validation=ds_holdout["train"],
    test=ds_holdout["test"],
)

print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'lang'],
        num_rows: 3733
    })
    validation: Dataset({
        features: ['text', 'labels', 'lang'],
        num_rows: 800
    })
    test: Dataset({
        features: ['text', 'labels', 'lang'],
        num_rows: 801
    })
})


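Create the tokenizer and the classification model with Hugging Face. No optimizer is configured explicitly, so the Trainer's standard (default) optimizer is used.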

In [None]:
# Reuse the checkpoint name and the tokenizer that were used to measure `max_length`.
# Keeping a single source of truth means the length statistics and the training
# tokenization cannot drift apart if the checkpoint is changed in one place only.
model_name = MODEL
tok = tokenizer

def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the global tokenizer ``tok``.

    Sequences longer than the global ``max_length`` are truncated. No padding
    is applied here.
    """
    texts = batch["text"]
    return tok(texts, max_length=max_length, truncation=True, padding=False)

# Tokenize every split in batches and drop the raw "text" and "lang" columns,
# leaving the tokenizer outputs next to "labels".
# NOTE(review): `ds` is rebound to the tokenized version, so re-running this cell
# will likely fail because "text"/"lang" no longer exist — confirm, or use a new name.
ds = ds.map(tokenize, batched=True, remove_columns=["text", "lang"])

# Load the pretrained encoder with a classification head of `num_labels` outputs.
# problem_type="multi_label_classification" presumably selects a per-label
# sigmoid/BCE loss inside the model — verify against the transformers docs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)


Map:   0%|          | 0/3733 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/801 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import f1_score, average_precision_score, precision_recall_fscore_support

Metrics for evaluating the model: micro and macro F1, the sample-averaged Jaccard index, and macro mAP.

In [None]:
def sigmoid(x):
    """Numerically stable element-wise logistic function ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : array-like or scalar
        Logits. Floating-point arrays keep their dtype.

    Returns
    -------
    numpy array (or NumPy scalar for scalar input) of values in [0, 1].
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it cannot overflow however large the logit is.
    # The naive form computes exp(-x), which overflows for large negative x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x)   |   x < 0: e^x / (1 + e^x)  -- the same function.
    probs = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves n-d arrays as they are.
    return probs[()]

def compute_metrics(eval_pred):
    """Compute multi-label evaluation metrics from logits and multi-hot labels.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)``; both are indexed as 2-D arrays with one row per
        sample and one column per class.

    Returns
    -------
    dict
        ``f1_micro`` and ``f1_macro`` at a 0.5 probability threshold,
        ``mAP_macro`` (NaN if scikit-learn raises ``ValueError``), and
        ``jaccard``, the mean per-sample intersection-over-union.
    """
    logits, labels = eval_pred
    probs = sigmoid(logits)
    preds = (probs >= 0.5).astype(int)
    micro = f1_score(labels, preds, average="micro", zero_division=0)
    macro = f1_score(labels, preds, average="macro", zero_division=0)
    try:
        mAP = average_precision_score(labels, probs, average="macro")
    except ValueError:
        # Deliberate best effort: report NaN rather than abort the evaluation.
        mAP = float("nan")

    # Jaccard: |true AND predicted| / |true OR predicted| per sample.
    true_mask = labels == 1
    pred_mask = preds == 1
    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)
    # A sample with no true and no predicted label scores 1.0. Start from ones and
    # divide only where the union is non-empty, so 0/0 is never evaluated
    # (np.where(..., a / b) would compute it first and emit a RuntimeWarning).
    per_sample_jaccard = np.ones(union.shape, dtype=np.float64)
    np.divide(intersection, union, out=per_sample_jaccard, where=union > 0)
    jaccard = per_sample_jaccard.mean()

    return {"f1_micro": micro, "f1_macro": macro, "mAP_macro": mAP, "jaccard": jaccard}

Training arguments

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the lowest validation loss when training ends.
args = TrainingArguments(
    output_dir="encoder_only",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=15,
    weight_decay=0.01,
    # Evaluation and saving use the same cadence so every evaluated epoch has a checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # NOTE(review): in the logged run the validation loss bottoms out at epoch 5 while
    # F1/Jaccard keep rising through epoch 10, so selecting on eval_loss picks a
    # low-F1 checkpoint — confirm this is the intended selection criterion.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_steps=50,
    report_to="none",
)

Create a trainer

In [None]:
# Wire model, data and metrics together. No data collator is passed explicitly.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    # `tokenizer=` is deprecated in Trainer (it triggers the warning printed under
    # this cell) and is dropped in newer releases; `processing_class=` replaces it.
    processing_class=tok,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


Train the model, and save the best model

In [None]:
# Fine-tune the model. The training arguments set load_best_model_at_end=True, so the
# model held by the trainer afterwards should be the best checkpoint, not the last one.
trainer.train()
# NOTE(review): the target is a Google Drive path, but no drive-mount call is visible
# in this notebook — confirm Drive is mounted at /content/drive before running.
trainer.save_model("/content/drive/MyDrive/nl_bestmodel_final_moreepoch")

Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Map Macro,Jaccard
1,0.4757,0.474543,0.0,0.0,0.249626,0.0
2,0.4593,0.462285,0.0,0.0,0.27745,0.0
3,0.4515,0.449943,0.085627,0.065124,0.308772,0.052292
4,0.4371,0.460405,0.125691,0.079323,0.328722,0.07
5,0.4091,0.445865,0.174157,0.13766,0.341267,0.115146
6,0.3882,0.455728,0.285881,0.234378,0.351356,0.196
7,0.3668,0.454332,0.2966,0.240827,0.365056,0.211562
8,0.3483,0.46599,0.314031,0.264854,0.361059,0.233646
9,0.3267,0.465592,0.324384,0.278327,0.368039,0.246146
10,0.3134,0.474593,0.340494,0.290512,0.366326,0.260667


Compute metrics on held-out test set

In [None]:
from sklearn.metrics import precision_recall_fscore_support

test_logits = trainer.predict(ds["test"]).predictions
test_labels = np.array(ds["test"]["labels"])
# Same logits -> probabilities helper as compute_metrics and predict use.
test_probs = sigmoid(test_logits)

# Same 0.5 decision threshold as during validation.
test_preds = (test_probs >= 0.5).astype(int)

# Precision, recall, F1 (micro-averaged over all sample/label pairs)
p, r, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average="micro", zero_division=0
)

# mAP macro
mAP = average_precision_score(test_labels, test_probs, average="macro")

# Jaccard: |true AND predicted| / |true OR predicted| per sample.
true_mask = test_labels == 1
pred_mask = test_preds == 1
intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
union = np.logical_or(true_mask, pred_mask).sum(axis=1)
# Empty-vs-empty counts as a perfect match (1.0). Dividing only where the union is
# non-empty avoids evaluating 0/0, which would emit a RuntimeWarning.
per_sample_jaccard = np.ones(union.shape, dtype=np.float64)
np.divide(intersection, union, out=per_sample_jaccard, where=union > 0)
jaccard = per_sample_jaccard.mean()

print({
    "precision_micro": p,
    "recall_micro": r,
    "f1_micro": f1,
    "mAP_macro": mAP,
    "jaccard": jaccard
})


{'precision_micro': 0.46543778801843316, 'recall_micro': 0.0876736111111111, 'f1_micro': 0.14755295836376917, 'mAP_macro': np.float64(0.3236833061504555), 'jaccard': np.float64(0.0945692883895131)}


Predict labels based on user input

In [None]:
def predict(texts, threshold=0.5):
    """Predict emotion labels for one text or a list of texts.

    Parameters
    ----------
    texts : str or sequence of str
        Input sentence(s).
    threshold : float, default 0.5
        A label is predicted when its probability is at least this value.

    Returns
    -------
    (outputs, probs)
        ``outputs`` is a list with one ``{"ids": [...], "names": [...]}`` dict
        per input text. ``probs`` is the array of per-class probabilities, one
        row per text.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to tokenize: return an empty result instead of failing in the tokenizer.
        return [], np.empty((0, len(idx2id)), dtype=np.float32)

    batch = tok(texts, truncation=True, padding=True, return_tensors="pt").to(model.device)

    # Inference must run with dropout disabled. Remember the current mode and
    # restore it afterwards so calling predict() has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**batch).logits.cpu().numpy()
    finally:
        model.train(was_training)

    probs = sigmoid(logits)
    preds = probs >= threshold

    # Map each row's active class indices back to emotion ids and names.
    outputs = []
    for row in preds:
        ids = [idx2id[i] for i, hit in enumerate(row) if hit]
        names = [emotions_mapping[i] for i in ids]
        outputs.append({"ids": ids, "names": names})
    return outputs, probs


In [None]:
# Smoke test on a single string; the call returns (decoded labels, per-class probabilities).
# NOTE(review): the sample sentence is English while only the "nl" data is enabled — confirm intended.
predict("Im waiting..")

([{'ids': [], 'names': []}],
 array([[0.10778446, 0.48648226, 0.02658658, 0.11224986, 0.16179483,
         0.03895107, 0.137575  , 0.15168554]], dtype=float32))