In [1]:
SRC = {
    "en": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/AnnotatedData/en-annotated.tsv",
    "fi": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/AnnotatedData/fi-annotated.tsv",
    "nl": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/Projections/nl-projections.tsv",
    "fr": "https://raw.githubusercontent.com/niek-alexander/LLM-Project/refs/heads/main/XED-master/Projections/fr-projections.tsv",
}

emotions_mapping = {
    1: "anger",
    2: "anticipation",
    3: "disgust",
    4: "fear",
    5: "joy",
    6: "sadness",
    7: "surprise",
    8: "trust",
}


In [3]:
import re
import unicodedata
import pandas as pd

URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
EMAIL_RE = re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b')

def basic_clean(text: str):
    if pd.isna(text):
        return ""
    # Unicode normalize
    x = unicodedata.normalize("NFKC", str(text))

    # Replace urls/emails/mentions with placeholders
    x = URL_RE.sub(" <URL> ", x)
    x = EMAIL_RE.sub(" <EMAIL> ", x)

    return x

def drop_exact_dupes(df):
    return df.drop_duplicates(subset=["text", "labels", "lang"])

In [4]:
def load_data(lang):
    df = pd.read_csv(SRC[lang], sep="\t", header=None, names=["text", "labels"])
    df["text"] = df["text"].astype(str).map(basic_clean)
    df["emotion"] = df["labels"].apply(lambda x: [emotions_mapping[int(label)] for label in x.split(',') if int(label) in emotions_mapping])
    df["lang"] = lang
    return df

datasets = {lang: load_data(lang).sample(frac=1, random_state=42).reset_index(drop=True) for lang in SRC}
multi_lang_df = pd.concat(datasets.values(), ignore_index=True)

In [5]:
!pip install transformers[sentencepiece] --quiet
from transformers import pipeline

In [6]:
model_name = "google/flan-t5-xl"
generator = pipeline("text2text-generation", model=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [7]:
def zero_shot_prompt(text):
    return f"""
    What emotions are expressed in the text?
    Answer with one or more of:
    anger, anticipation, disgust, fear, joy, sadness, surprise, trust


Text: "{text}"
Answer:"""

In [8]:
def few_shot_prompt(text, examples):
    examples_str = "\n\n".join([
        f"Text: \"{ex}\"\nAnswer: {lab}" for ex, lab in examples
    ])
    return f"""
Examples:
{examples_str}

What emotions are expressed in the next text?
Classify the emotions expressed in the following text with one or more of:
anger, anticipation, disgust, fear, joy, sadness, surprise, trust
Text: "{text}"
Answer:"""

In [9]:
def instruction_prompt(text):
    return f"""You are an emotion classifier.
Analyze the following text and identify all emotions present.
Use ONLY these labels: anger, anticipation, disgust, fear, joy, sadness, surprise, trust.
If multiple emotions apply, list them separated by commas and do not use a period at the end of the output.

Text: {text}
Emotions:"""

In [10]:
prompt_strategies = {
    "zero_shot": zero_shot_prompt,
    "few_shot": few_shot_prompt,
    "instruction": instruction_prompt,
}

In [11]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings( #silence global warnings, meant for testing small sets
    "ignore",
    category=UndefinedMetricWarning,
    message="F-score is ill-defined.*"
)

from sklearn.metrics import f1_score, classification_report

def parse_output(output_text):
    output_text = output_text.strip().lower()
    if output_text == "none" or output_text == "":
        return []
    return [label.strip() for label in output_text.split(",") if label.strip() in emotions_mapping.values()]

def generate_predictions(df, model_pipeline, prompt_fn):
    predictions = []
    for text in df['text']:
        prompt = prompt_fn(text)
        output = model_pipeline(prompt, do_sample=False)[0]['generated_text']
        predictions.append(parse_output(output))
    return predictions

def evaluate(df, predictions):
    all_labels = list(emotions_mapping.values())

    def binarize(labels):
        return [1 if label in labels else 0 for label in all_labels]

    y_true_bin = [binarize(labels) for labels in df['emotion']]
    y_pred_bin = [binarize(labels) for labels in predictions]

    f1_micro = f1_score(y_true_bin, y_pred_bin, average='micro')
    f1_macro = f1_score(y_true_bin, y_pred_bin, average='macro')
    report = classification_report(y_true_bin, y_pred_bin, target_names=all_labels, zero_division=0)

    return {"f1_micro": f1_micro, "f1_macro": f1_macro, "report": report}

In [13]:
results = {}
n_zero_shot = 200
n_few_shot= 200
n_instr_shot = 200
n_few_len = 4

for lang, df in datasets.items():
    # Zero-shot:
    df_zero = df.head(n_zero_shot).copy()
    zero_preds = []
    for text in df_zero['text']:
        prompt = zero_shot_prompt(text)
        output = generator(prompt, do_sample=False, max_new_tokens=65)[0]['generated_text']
        zero_preds.append(parse_output(output))

    zero_results = evaluate(df_zero, zero_preds)
    print(f"{lang.upper()} Zero-shot ({n_zero_shot}): F1 Micro: {zero_results['f1_micro']:.3f} | F1 Macro: {zero_results['f1_macro']:.3f}")

# Few-shot:
example_texts = df.iloc[n_zero_shot:n_zero_shot+n_few_len]['text'].tolist()
example_labels = df.iloc[n_zero_shot:n_zero_shot+n_few_len]['emotion'].apply(lambda x: ", ".join(x) if x else "none").tolist()
examples = list(zip(example_texts, example_labels))

for lang, df in datasets.items():
    df_few = df.iloc[n_zero_shot+n_few_len + 1:n_zero_shot+n_few_len + 1 + n_few_shot].copy().reset_index(drop=True)
    few_preds = []
    for i, text in enumerate(df_few['text']):
        prompt = few_shot_prompt(text, examples)
        output = generator(prompt, do_sample=False, max_new_tokens=65)[0]['generated_text']
        few_preds.append(parse_output(output))

    few_results = evaluate(df_few, few_preds)
    print(f"{lang.upper()} Few-shot ({n_few_shot}):   F1 Micro: {few_results['f1_micro']:.3f} | F1 Macro: {few_results['f1_macro']:.3f}")

if lang not in results:
    results[lang] = {}
    results[lang]['zero_shot'] = zero_results
    results[lang]['few_shot'] = few_results

for lang, df in datasets.items():
  df_instr = df.iloc[n_zero_shot+n_few_shot+n_few_len + 1:n_few_shot + n_zero_shot+n_few_len + 1 + n_instr_shot].copy().reset_index(drop=True)
  instr_preds = []
  for i, text in enumerate(df_instr['text']):
      prompt = instruction_prompt(text)
      output = generator(prompt, do_sample=False, max_new_tokens=65)[0]['generated_text']
      instr_preds.append(parse_output(output))

  instr_results = evaluate(df_instr, instr_preds)
  print(f"{lang.upper()} instr-shot ({n_instr_shot}): F1 Micro: {instr_results['f1_micro']:.3f} | F1 Macro: {instr_results['f1_macro']:.3f}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


EN Zero-shot (200): F1 Micro: 0.534 | F1 Macro: 0.494
FI Zero-shot (200): F1 Micro: 0.299 | F1 Macro: 0.251
NL Zero-shot (200): F1 Micro: 0.248 | F1 Macro: 0.215
FR Zero-shot (200): F1 Micro: 0.313 | F1 Macro: 0.294
EN Few-shot (200):   F1 Micro: 0.491 | F1 Macro: 0.436
FI Few-shot (200):   F1 Micro: 0.345 | F1 Macro: 0.309
NL Few-shot (200):   F1 Micro: 0.264 | F1 Macro: 0.235
FR Few-shot (200):   F1 Micro: 0.346 | F1 Macro: 0.307
EN instr-shot (200): F1 Micro: 0.412 | F1 Macro: 0.415
FI instr-shot (200): F1 Micro: 0.286 | F1 Macro: 0.267
NL instr-shot (200): F1 Micro: 0.293 | F1 Macro: 0.268
FR instr-shot (200): F1 Micro: 0.332 | F1 Macro: 0.309


In [14]:
multi_lang_df = multi_lang_df.sample(frac=1, random_state=42).reset_index(drop=True)


multi_zero = multi_lang_df.head(n_zero_shot).copy()
fewshot_candidates = multi_lang_df.iloc[n_zero_shot + n_few_shot : ]
fewshot_examples_df = fewshot_candidates.sample(n=n_few_len, random_state=42)


multi_zero_preds = [
    parse_output(generator(zero_shot_prompt(t), do_sample=False, max_new_tokens=65)[0]['generated_text'])
    for t in multi_zero['text']
]
multi_zero_res = evaluate(multi_zero, multi_zero_preds)
print(f"MULTI Zero-shot ({n_zero_shot}): "
      f"F1 Micro: {multi_zero_res['f1_micro']:.3f} | F1 Macro: {multi_zero_res['f1_macro']:.3f}")

# Few-shot examples
multi_ex_texts = fewshot_examples_df['text'].tolist()
multi_ex_labels = fewshot_examples_df['emotion'] \
    .apply(lambda x: ", ".join(x) if x else "none").tolist()

multi_examples = list(zip(multi_ex_texts, multi_ex_labels))

# Few-shot evaluation
eval_start = n_zero_shot
eval_end   = eval_start + n_few_shot
multi_few = multi_lang_df.iloc[eval_start:eval_end].copy().reset_index(drop=True)

multi_few_preds = [
    parse_output(
        generator(few_shot_prompt(text, multi_examples), do_sample=False, max_new_tokens=65)[0]['generated_text']
    )
    for text in multi_few['text']
]

multi_few_res = evaluate(multi_few, multi_few_preds)
print(f"MULTI Few-shot ({n_few_shot}): "
      f"F1 Micro: {multi_few_res['f1_micro']:.3f} | F1 Macro: {multi_few_res['f1_macro']:.3f}")

# Instruction-prompt evaluation (same slice)
multi_instr_preds = [
    parse_output(generator(instruction_prompt(t), do_sample=False, max_new_tokens=65)[0]['generated_text'])
    for t in multi_few['text']
]
multi_instr_res = evaluate(multi_few, multi_instr_preds)
print(f"MULTI Instruction ({n_instr_shot}): "
      f"F1 Micro: {multi_instr_res['f1_micro']:.3f} | F1 Macro: {multi_instr_res['f1_macro']:.3f}")

results['multi'] = {
    'zero_shot'   : multi_zero_res,
    'few_shot'    : multi_few_res,
    'instruction' : multi_instr_res
}

MULTI Zero-shot (200): F1 Micro: 0.355 | F1 Macro: 0.315
MULTI Few-shot (200): F1 Micro: 0.339 | F1 Macro: 0.308
MULTI Instruction (200): F1 Micro: 0.334 | F1 Macro: 0.331

All three strategies (zero-shot, few-shot, instruction) completed for every language + MULTI!
