In [17]:
import os; os.environ["WANDB_DISABLED"] = "true"  # W&B aus (optional)
import pandas as pd, numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

In [18]:
# Experiment overview: two QA-head models (start/end) and one IO token-labeling model.
# Evaluation is reported per language (EM/F1); unanswerable => empty span / all-O labels.

# Load the data and keep only the relevant languages for computational reasons.
langs = ["ar", "ko", "te"]
splits = {"train": "train.parquet", "validation": "validation.parquet"}

df_train = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
    .loc[lambda frame: frame.lang.isin(langs)]
    .reset_index(drop=True)
)
df_val = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
    .loc[lambda frame: frame.lang.isin(langs)]
    .reset_index(drop=True)
)

# Convert to Hugging Face Datasets, carrying over only the columns used downstream.
train_ds = Dataset.from_pandas(
    df_train[["lang", "question", "context", "answerable", "answer_start", "answer"]],
    preserve_index=False,
)
val_ds = Dataset.from_pandas(
    df_val[["lang", "question", "context", "answerable", "answer_start", "answer"]],
    preserve_index=False,
)

In [19]:
# Preprocessing QA-Head (Start/End-Targets)
def build_preprocess(tokenizer, max_length=384, stride=128):
    """Build a per-example preprocessing function for the start/end QA head.

    Args:
        tokenizer: Callable tokenizer invoked as ``tokenizer(question, context, ...)``.
            Its result must expose ``sequence_ids()`` and an ``"offset_mapping"`` entry.
        max_length: Forwarded to the tokenizer as ``max_length``.
        stride: Forwarded to the tokenizer as ``stride``. No overflow windows are
            requested here, so every example yields exactly one (possibly
            truncated) feature.

    Returns:
        A function that takes ONE example (not a batch) with the keys
        ``question``, ``context``, ``answerable``, ``answer`` and ``answer_start``
        and returns the tokenizer output extended with ``start_positions`` and
        ``end_positions`` (``offset_mapping`` is removed).
    """
    no_answer_index = 0  # start/end target used whenever no answer span is available

    def preprocess_qa(examples):
        """Tokenize one example and attach its start/end token targets."""
        tokenized_examples = tokenizer(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=max_length,
            stride=stride,
            return_offsets_mapping=True,
            return_token_type_ids=True,
        )

        start = end = no_answer_index

        if examples["answerable"]:
            # Character range [answer0, answer1) of the answer inside the context.
            answer0 = examples["answer_start"]
            answer1 = answer0 + len(examples["answer"])

            offset_mapping = tokenized_examples["offset_mapping"]
            context_token_indices = [
                idx
                for idx, seq_id in enumerate(tokenized_examples.sequence_ids())
                if seq_id == 1
            ]

            if context_token_indices:
                context_start = context_token_indices[0]
                context_end = context_token_indices[-1]

                # First context token that ends after the answer begins.
                first = context_start
                while first <= context_end and offset_mapping[first][1] <= answer0:
                    first += 1

                # Last context token that begins before the answer ends.
                last = first
                while last <= context_end and offset_mapping[last][0] < answer1:
                    last += 1
                last -= 1

                # Accept the span only if at least one kept token overlaps the
                # answer. Otherwise (answer truncated away, empty answer text, or
                # answer_start pointing between tokens) keep the no-answer target
                # instead of emitting an inverted span with end < start.
                if first <= last:
                    start, end = first, last

        tokenized_examples["start_positions"] = start
        tokenized_examples["end_positions"] = end
        tokenized_examples.pop("offset_mapping", None)

        return tokenized_examples

    return preprocess_qa


In [20]:
### Metrics for QA-Head: exact match and token-overlap F1 between predicted and gold spans
def compute_metrics_qa(eval_pred):
    """Score predicted start/end positions against the gold positions.

    ``eval_pred.predictions`` is indexed as ``[0]`` (start scores) and ``[1]``
    (end scores); the argmax over the last axis is taken as the predicted
    position. ``eval_pred.label_ids`` may be a dict with ``start_positions`` /
    ``end_positions``, a 2-element list/tuple, or a single object that is then
    used for both start and end.

    Returns:
        dict with ``exact_match`` and ``f1_token``, each the mean over examples.
    """
    pred_starts = np.argmax(eval_pred.predictions[0], -1)
    pred_ends = np.argmax(eval_pred.predictions[1], -1)

    references = eval_pred.label_ids
    if isinstance(references, dict):
        gold_starts, gold_ends = references["start_positions"], references["end_positions"]
    elif isinstance(references, (list, tuple)) and len(references) == 2:
        gold_starts, gold_ends = references
    else:
        gold_starts = gold_ends = references

    def token_positions(first, last):
        """Return the set of token indices covered by an inclusive span."""
        lo = int(np.asarray(first).reshape(-1)[0])
        hi = int(np.asarray(last).reshape(-1)[0])
        # (0, 0) is the no-answer marker; an inverted span covers nothing either.
        if (lo == 0 and hi == 0) or hi < lo:
            return set()
        return set(range(lo, hi + 1))

    def overlap_f1(pred_set, gold_set):
        """F1 over token indices; two empty sets count as a perfect match."""
        if not pred_set and not gold_set:
            return 1
        if not pred_set or not gold_set:
            return 0
        shared = len(pred_set.intersection(gold_set))
        precision = shared / len(pred_set)
        recall = shared / len(gold_set)
        if (precision + recall) > 0:
            return 2 * (precision * recall) / (precision + recall)
        return 0

    exact_scores = []
    f1_scores = []
    for p_start, p_end, g_start, g_end in zip(pred_starts, pred_ends, gold_starts, gold_ends):
        pred_set = token_positions(p_start, p_end)
        gold_set = token_positions(g_start, g_end)
        exact_scores.append(int(pred_set == gold_set))
        f1_scores.append(overlap_f1(pred_set, gold_set))

    return {"exact_match": np.mean(exact_scores), "f1_token": np.mean(f1_scores)}


In [21]:
## Preprocessing for the token-labeling head (IO targets)
def build_preprocess_token(tokenizer, max_length=384, stride=128):
    """Build a per-example preprocessing function producing IO token labels.

    The returned function tokenizes one (question, context) pair and adds a
    ``labels`` list with one entry per token: ``-100`` outside the context span,
    ``0`` (O) for context tokens, and ``1`` (I) for context tokens whose
    character offsets overlap the answer range of an answerable example.
    ``offset_mapping`` is removed from the result.
    """
    def preprocess_token(examples):
        has_answer = examples["answerable"]
        answer_text = examples["answer"] if has_answer else ""
        answer_begin = examples["answer_start"] if has_answer else -1

        tokenized_examples = tokenizer(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=max_length,
            stride=stride,
            return_offsets_mapping=True,
            return_token_type_ids=True
        )

        seq_ids = tokenized_examples.sequence_ids()
        offsets = tokenized_examples["offset_mapping"]

        # Everything starts out ignored (-100); the context span is filled in below.
        token_labels = [-100] * len(tokenized_examples["input_ids"])

        context_positions = [pos for pos, seq_id in enumerate(seq_ids) if seq_id == 1]
        if context_positions:
            first_ctx, last_ctx = context_positions[0], context_positions[-1]
            for pos in range(first_ctx, last_ctx + 1):
                token_labels[pos] = 0  # default O-label

            if has_answer:
                answer_end = answer_begin + len(answer_text)
                for pos in range(first_ctx, last_ctx + 1):
                    tok_begin, tok_end = offsets[pos]
                    # Token overlaps the answer's character range -> I-label.
                    if tok_end > answer_begin and tok_begin < answer_end:
                        token_labels[pos] = 1

        del tokenized_examples["offset_mapping"]
        tokenized_examples["labels"] = token_labels
        return tokenized_examples
    return preprocess_token

In [22]:
## Metrics for the token-labeling head (F1 of the I-label)

def compute_metrics_token(eval_pred):
    """Compute F1 for label 1 over all positions whose gold label is not -100.

    Predictions are the argmax of ``eval_pred.predictions`` over the last axis.
    Precision, recall and F1 fall back to 0 when their denominator is 0.

    Returns:
        dict with the single key ``f1_token``.
    """
    gold = eval_pred.label_ids
    predicted = np.argmax(eval_pred.predictions, axis=-1)

    # Drop positions marked with the ignore index.
    keep = gold != -100
    gold_kept = gold[keep]
    predicted_kept = predicted[keep]

    predicted_inside = predicted_kept == 1
    gold_inside = gold_kept == 1

    tp = np.sum(predicted_inside & gold_inside)
    fp = np.sum(predicted_inside & (gold_kept == 0))
    fn = np.sum((predicted_kept == 0) & gold_inside)

    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return {"f1_token": f1}

In [23]:
# ==== Train & Evaluate (2x QA-Head + 1x IO-Token-Labeling) ====
# For every entry in MODELS: tokenize the train/validation sets, fine-tune for one
# epoch, then evaluate separately on each language in `langs`.

# NOTE(review): these names are already imported in the first cell; this import is redundant.
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

# Order matters: the loop below treats the first two entries as QA-head models
# and every later entry as an IO token classifier.
MODELS = [
    "google-bert/bert-base-multilingual-cased",      # QA-Head
    "distilbert/distilbert-base-multilingual-cased", # QA-Head
    "xlm-roberta-base",                               # IO-Token-Classifier
]

for i, model_name in enumerate(MODELS):
    print(f"\n=== Training model: {model_name} ===")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    data_collator = None
    if i < 2:
        # QA head (start/end) - uses build_preprocess (no sliding window!)
        prep = build_preprocess(tokenizer)
        train_prep = train_ds.map(prep, remove_columns=train_ds.column_names)
        val_prep   = val_ds.map(prep,   remove_columns=val_ds.column_names)
        model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        compute_metrics = compute_metrics_qa
    else:
        # IO token labeling - uses build_preprocess_token
        prep = build_preprocess_token(tokenizer)
        train_prep = train_ds.map(prep, remove_columns=train_ds.column_names)
        val_prep   = val_ds.map(prep,   remove_columns=val_ds.column_names)
        model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)
        compute_metrics = compute_metrics_token
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)  # pads the labels along with the inputs

    # Output directory is derived from the last path component of the model name.
    training_args = TrainingArguments(
        output_dir=f"wk40_{model_name.split('/')[-1]}",
        learning_rate=2e-5,
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        logging_steps=100,
        do_eval=True,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_prep,
        eval_dataset=val_prep,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,  # None for the QA heads, dedicated collator for IO labeling
    )

    trainer.train()

    # --- Per-language evaluation ---
    for lang in langs:
        # L=lang binds the current language as a default argument of the lambda.
        val_lang = val_ds.filter(lambda ex, L=lang: ex["lang"] == L)
        # Re-run the same preprocessing on the language subset before evaluating.
        val_lang_prep = val_lang.map(prep, remove_columns=val_lang.column_names)
        metrics = trainer.evaluate(eval_dataset=val_lang_prep)
        print(f"VAL [{lang}]: {metrics}")



=== Training model: google-bert/bert-base-multilingual-cased ===


Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,3.016
200,1.9375
300,1.8005


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

VAL [ar]: {'eval_loss': 1.4767183065414429, 'eval_exact_match': 0.5012048192771085, 'eval_f1_token': 0.5826509688223139, 'eval_runtime': 6.8167, 'eval_samples_per_second': 60.88, 'eval_steps_per_second': 3.814, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

VAL [ko]: {'eval_loss': 1.4217687845230103, 'eval_exact_match': 0.5, 'eval_f1_token': 0.5873895294829784, 'eval_runtime': 5.8499, 'eval_samples_per_second': 60.856, 'eval_steps_per_second': 3.932, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

VAL [te]: {'eval_loss': 2.128549337387085, 'eval_exact_match': 0.3802083333333333, 'eval_f1_token': 0.4502532817656715, 'eval_runtime': 5.8223, 'eval_samples_per_second': 65.953, 'eval_steps_per_second': 4.122, 'epoch': 1.0}

=== Training model: distilbert/distilbert-base-multilingual-cased ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,3.5258
200,2.6306
300,2.5384


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

VAL [ar]: {'eval_loss': 2.312197208404541, 'eval_exact_match': 0.3349397590361446, 'eval_f1_token': 0.37504114069667344, 'eval_runtime': 3.5332, 'eval_samples_per_second': 117.458, 'eval_steps_per_second': 7.359, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

VAL [ko]: {'eval_loss': 2.3777549266815186, 'eval_exact_match': 0.2640449438202247, 'eval_f1_token': 0.3289931638270697, 'eval_runtime': 2.998, 'eval_samples_per_second': 118.747, 'eval_steps_per_second': 7.672, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

VAL [te]: {'eval_loss': 2.43961238861084, 'eval_exact_match': 0.2760416666666667, 'eval_f1_token': 0.3273516482285099, 'eval_runtime': 3.0425, 'eval_samples_per_second': 126.211, 'eval_steps_per_second': 7.888, 'epoch': 1.0}

=== Training model: xlm-roberta-base ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,0.1515
200,0.1254
300,0.1165


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

VAL [ar]: {'eval_loss': 0.10345359146595001, 'eval_f1_token': 0.07607607607607607, 'eval_runtime': 7.0093, 'eval_samples_per_second': 59.207, 'eval_steps_per_second': 3.709, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

VAL [ko]: {'eval_loss': 0.11504897475242615, 'eval_f1_token': 0.04896626768226332, 'eval_runtime': 5.9132, 'eval_samples_per_second': 60.204, 'eval_steps_per_second': 3.89, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

VAL [te]: {'eval_loss': 0.07522919774055481, 'eval_f1_token': 0.07900852052672347, 'eval_runtime': 5.9045, 'eval_samples_per_second': 65.035, 'eval_steps_per_second': 4.065, 'epoch': 1.0}
