In [1]:
# Install
!pip -q install transformers datasets accelerate seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [1]:
import torch; print("cuda?", torch.cuda.is_available())


cuda? True


In [7]:
# MINIMAL Week 40: span-based QA (ar/ko/te) with k=3 backbones

import pandas as pd, numpy as np, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# --- data ---
langs = ["ar","ko","te"]
splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val   = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
df_train = df_train[df_train.lang.isin(langs)].reset_index(drop=True)
df_val   = df_val[df_val.lang.isin(langs)].reset_index(drop=True)

def to_hf(df):  # keep only needed columns
    keep = ["lang","question","context","answerable","answer_start","answer"]
    return Dataset.from_pandas(df[keep], preserve_index=False)

ds_tr, ds_va = to_hf(df_train), to_hf(df_val)

# --- preprocessing to SQuAD-style start/end ---
def build_preprocess(tokenizer, max_len=384, stride=128):
    cls_id = tokenizer.cls_token_id
    def _prep(ex):
        q = ex["question"]; c = ex["context"]
        ans_text = ex["answer"] if ex["answerable"] else ""
        ans_start = ex["answer_start"] if ex["answerable"] else -1

        enc = tokenizer(
            q, c,
            max_length=max_len, truncation="only_second", stride=stride,
            return_offsets_mapping=True, return_token_type_ids=True
        )
        # default to CLS (unanswerable or lost by truncation)
        start = end = enc["input_ids"].index(cls_id)

        if ex["answerable"]:
            # find token span inside the context segment
            seq_ids = enc.sequence_ids()
            offsets = enc["offset_mapping"]
            # locate the first context token idx
            c_tok_idxs = [i for i,s in enumerate(seq_ids) if s==1]
            if c_tok_idxs:
                c0, c1 = c_tok_idxs[0], c_tok_idxs[-1]
                # character indexes of answer
                a0, a1 = ans_start, ans_start + len(ans_text)
                # move start forward to the token whose span starts after/before a0
                i = c0
                while i<=c1 and (offsets[i][0] <= a0 and offsets[i][1] <= a0):
                    i += 1
                # back up one step if we stepped past
                while i>c0 and offsets[i-1][0] <= a0 < offsets[i-1][1]:
                    i -= 1
                # if inside range, expand to cover full answer
                if c0 <= i <= c1:
                    j = i
                    while j<=c1 and offsets[j][0] < a1:
                        j += 1
                    start, end = i, min(j-1, c1)

        enc["start_positions"] = start
        enc["end_positions"]   = end
        enc.pop("offset_mapping")
        return enc
    return _prep

# --- tiny exact match / F1 on tokens (very minimal) ---
def simple_metrics(eval_preds, tokenizer=None):
    preds_start = np.argmax(eval_preds.predictions[0], -1)
    preds_end   = np.argmax(eval_preds.predictions[1], -1)

    # alte Version: label_ids ist Tupel (start, end)
    # neue Version: label_ids ist Dict
    labels = eval_preds.label_ids
    if isinstance(labels, dict):
        labels_start, labels_end = labels["start_positions"], labels["end_positions"]
    else:
        labels_start, labels_end = labels  # Tupel

    em = np.mean((preds_start == labels_start) & (preds_end == labels_end))

    # token-level overlap F1 (approx)
    f1s=[]
    for ps,pe,ls,le in zip(preds_start,preds_end,labels_start,labels_end):
        P=set(range(ps,pe+1)) if pe>=ps else set()
        G=set(range(ls,le+1)) if le>=ls else set()
        if not P and not G: f1=1.0
        elif not P or not G: f1=0.0
        else:
            prec = len(P&G)/len(P); rec = len(P&G)/len(G)
            f1 = 0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)
        f1s.append(f1)

    return {"em": float(em), "f1_token": float(np.mean(f1s))}


# --- k=3 backbones (change as you like) ---
MODELS = [
  "google-bert/bert-base-multilingual-cased",
  "distilbert/distilbert-base-multilingual-cased",
  "xlm-roberta-base"
]

for mname in MODELS:
    print("\n=== Training:", mname, "===")
    tok = AutoTokenizer.from_pretrained(mname, use_fast=True)
    prep = build_preprocess(tok)

    tr_enc = ds_tr.map(prep, remove_columns=ds_tr.column_names)
    va_enc = ds_va.map(prep, remove_columns=ds_va.column_names)

    model = AutoModelForQuestionAnswering.from_pretrained(mname)

    args = TrainingArguments(
        output_dir=f"wk40_{mname.split('/')[-1]}",
        learning_rate=5e-5, per_device_train_batch_size=8, per_device_eval_batch_size=8,
        num_train_epochs=1, weight_decay=0.01, logging_steps=100, report_to="none",
    )

    def compute_metrics(p): return simple_metrics(p, tok)

    trainer = Trainer(
        model=model, args=args, tokenizer=tok,
        train_dataset=tr_enc, eval_dataset=va_enc,
        compute_metrics=compute_metrics
    )
    trainer.train()
    print(trainer.evaluate())



=== Training: google-bert/bert-base-multilingual-cased ===


Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,2.9742
200,2.2398
300,1.9879
400,1.635
500,1.7249
600,1.6256
700,1.5404


{'eval_loss': 1.5402641296386719, 'eval_em': 0.47965367965367967, 'eval_f1_token': 0.564578970553351, 'eval_runtime': 18.9358, 'eval_samples_per_second': 60.996, 'eval_steps_per_second': 7.657, 'epoch': 1.0}

=== Training: distilbert/distilbert-base-multilingual-cased ===


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,3.1675
200,2.5473
300,2.5208
400,2.2426
500,2.2422
600,2.1562
700,2.0421


{'eval_loss': 1.9802366495132446, 'eval_em': 0.37402597402597404, 'eval_f1_token': 0.44712074926530665, 'eval_runtime': 8.9814, 'eval_samples_per_second': 128.599, 'eval_steps_per_second': 16.144, 'epoch': 1.0}

=== Training: xlm-roberta-base ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,3.2804
200,2.692
300,2.629
400,2.4122
500,2.416
600,2.3835
700,2.2568


Step,Training Loss
100,3.2804
200,2.692
300,2.629
400,2.4122
500,2.416
600,2.3835
700,2.2568


{'eval_loss': 2.3212661743164062, 'eval_em': 0.2909090909090909, 'eval_f1_token': 0.36364621791364693, 'eval_runtime': 18.2604, 'eval_samples_per_second': 63.252, 'eval_steps_per_second': 7.941, 'epoch': 1.0}


In [9]:
# MINIMAL Week 40: span-based QA (ar/ko/te) with k=3 backbones

import pandas as pd, numpy as np, torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

# --- data ---
langs = ["ar","ko","te"]
splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val   = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
df_train = df_train[df_train.lang.isin(langs)].reset_index(drop=True)
df_val   = df_val[df_val.lang.isin(langs)].reset_index(drop=True)

def to_hf(df):  # keep only needed columns
    keep = ["lang","question","context","answerable","answer_start","answer"]
    return Dataset.from_pandas(df[keep], preserve_index=False)

ds_tr, ds_va = to_hf(df_train), to_hf(df_val)

# --- preprocessing to SQuAD-style start/end ---
def build_preprocess(tokenizer, max_len=384, stride=128):
    cls_id = tokenizer.cls_token_id
    def _prep(ex):
        q = ex["question"]; c = ex["context"]
        ans_text  = ex["answer"] if ex["answerable"] else ""
        ans_start = ex["answer_start"] if ex["answerable"] else -1

        enc = tokenizer(
            q, c,
            max_length=max_len, truncation="only_second", stride=stride,
            return_offsets_mapping=True, return_token_type_ids=True
        )
        # default to CLS (unanswerable or lost by truncation)
        start = end = enc["input_ids"].index(cls_id)

        if ex["answerable"]:
            seq_ids = enc.sequence_ids()
            offsets = enc["offset_mapping"]
            c_tok_idxs = [i for i,s in enumerate(seq_ids) if s==1]
            if c_tok_idxs:
                c0, c1 = c_tok_idxs[0], c_tok_idxs[-1]
                a0, a1 = ans_start, ans_start + len(ans_text)
                i = c0
                while i<=c1 and (offsets[i][0] <= a0 and offsets[i][1] <= a0):
                    i += 1
                while i>c0 and offsets[i-1][0] <= a0 < offsets[i-1][1]:
                    i -= 1
                if c0 <= i <= c1:
                    j = i
                    while j<=c1 and offsets[j][0] < a1:
                        j += 1
                    start, end = i, min(j-1, c1)

        enc["start_positions"] = start
        enc["end_positions"]   = end
        enc.pop("offset_mapping")
        return enc
    return _prep

# --- sequence-labelling metrics (EM + Token-F1) mit leerem Output für Unanswerable ---
# Annahme: CLS-Token steht an Position 0 (gilt für BERT/XLM-R Standard-Tokenisierung).
def simple_metrics(eval_preds, tokenizer=None):
    ps = np.argmax(eval_preds.predictions[0], -1)
    pe = np.argmax(eval_preds.predictions[1], -1)

    labels = eval_preds.label_ids
    if isinstance(labels, dict):
        ls, le = labels["start_positions"], labels["end_positions"]
    else:
        ls, le = labels  # (start, end)

    def span_to_set(s, e):
        # Unanswerable: leere Menge (Aufgabenforderung)
        if s==0 and e==0:
            return set()
        if e < s:
            return set()
        return set(range(int(s), int(e)+1))

    # EM (exakt gleiche Token-Menge)
    em_list, f1_list = [], []
    for a,b,c,d in zip(ps,pe,ls,le):
        P, G = span_to_set(a,b), span_to_set(c,d)
        em_list.append(P == G)
        if not P and not G:
            f1 = 1.0
        elif not P or not G:
            f1 = 0.0
        else:
            inter = len(P & G)
            prec  = inter/len(P)
            rec   = inter/len(G)
            f1    = 0.0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)
        f1_list.append(f1)

    return {"em": float(np.mean(em_list)), "f1_token": float(np.mean(f1_list))}

# --- k=3 backbones ---
MODELS = [
  "google-bert/bert-base-multilingual-cased",
  "distilbert/distilbert-base-multilingual-cased",
  "xlm-roberta-base"
]

for mname in MODELS:
    print("\n=== Training:", mname, "===")
    tok = AutoTokenizer.from_pretrained(mname, use_fast=True)
    prep = build_preprocess(tok)

    tr_enc = ds_tr.map(prep, remove_columns=ds_tr.column_names)
    va_enc = ds_va.map(prep, remove_columns=ds_va.column_names)

    model = AutoModelForQuestionAnswering.from_pretrained(mname)

    args = TrainingArguments(
        output_dir=f"wk40_{mname.split('/')[-1]}",
        learning_rate=5e-5, per_device_train_batch_size=8, per_device_eval_batch_size=8,
        num_train_epochs=1, weight_decay=0.01, logging_steps=100, do_eval=True, report_to=[]
        # ggf. alte HF-Version: 'report_to'/'evaluation_strategy' entfernen
    )

    trainer = Trainer(
        model=model, args=args, tokenizer=tok,
        train_dataset=tr_enc, eval_dataset=va_enc,
        compute_metrics=simple_metrics
    )

    trainer.train()
    print("VAL (all):", trainer.evaluate())

    # --- Pflicht: pro Sprache evaluieren und vergleichen ---
    for L in langs:
        va_L = ds_va.filter(lambda ex, L=L: ex["lang"]==L)
        va_L_enc = va_L.map(prep, remove_columns=va_L.column_names)
        res_L = trainer.evaluate(eval_dataset=va_L_enc)
        print(f"VAL [{L}]:", res_L)



=== Training: google-bert/bert-base-multilingual-cased ===


Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,2.8285
200,2.0228
300,1.934
400,1.6466
500,1.6907
600,1.604
700,1.5303


VAL (all): {'eval_loss': 1.5111550092697144, 'eval_em': 0.5142857142857142, 'eval_f1_token': 0.5941411456755312, 'eval_runtime': 17.8615, 'eval_samples_per_second': 64.664, 'eval_steps_per_second': 8.118, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

VAL [ar]: {'eval_loss': 1.308828592300415, 'eval_em': 0.5783132530120482, 'eval_f1_token': 0.6586992695511058, 'eval_runtime': 6.5983, 'eval_samples_per_second': 62.895, 'eval_steps_per_second': 7.881, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

VAL [ko]: {'eval_loss': 1.2079746723175049, 'eval_em': 0.5421348314606742, 'eval_f1_token': 0.6355089895430546, 'eval_runtime': 5.5718, 'eval_samples_per_second': 63.893, 'eval_steps_per_second': 8.076, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

VAL [te]: {'eval_loss': 2.010852575302124, 'eval_em': 0.4192708333333333, 'eval_f1_token': 0.4860198596724014, 'eval_runtime': 6.0596, 'eval_samples_per_second': 63.371, 'eval_steps_per_second': 7.921, 'epoch': 1.0}

=== Training: distilbert/distilbert-base-multilingual-cased ===


Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,3.1574
200,2.5042
300,2.5218
400,2.2563
500,2.2842
600,2.2076
700,2.0923


VAL (all): {'eval_loss': 2.05647349357605, 'eval_em': 0.37922077922077924, 'eval_f1_token': 0.44577972332260773, 'eval_runtime': 9.7804, 'eval_samples_per_second': 118.093, 'eval_steps_per_second': 14.826, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

VAL [ar]: {'eval_loss': 1.8374948501586914, 'eval_em': 0.42891566265060244, 'eval_f1_token': 0.5083277836076332, 'eval_runtime': 3.6584, 'eval_samples_per_second': 113.436, 'eval_steps_per_second': 14.214, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

VAL [ko]: {'eval_loss': 2.0715506076812744, 'eval_em': 0.33707865168539325, 'eval_f1_token': 0.40414937448087657, 'eval_runtime': 3.0522, 'eval_samples_per_second': 116.636, 'eval_steps_per_second': 14.743, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

VAL [te]: {'eval_loss': 2.278899908065796, 'eval_em': 0.3645833333333333, 'eval_f1_token': 0.4167770128261772, 'eval_runtime': 3.298, 'eval_samples_per_second': 116.436, 'eval_steps_per_second': 14.554, 'epoch': 1.0}

=== Training: xlm-roberta-base ===


Map:   0%|          | 0/6335 [00:00<?, ? examples/s]

Map:   0%|          | 0/1155 [00:00<?, ? examples/s]

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
100,3.2616
200,2.6814
300,2.6581
400,2.4344
500,2.4158
600,2.3896
700,2.1918


Step,Training Loss
100,3.2616
200,2.6814
300,2.6581
400,2.4344
500,2.4158
600,2.3896
700,2.1918


VAL (all): {'eval_loss': 2.180102586746216, 'eval_em': 0.341991341991342, 'eval_f1_token': 0.4145376237267371, 'eval_runtime': 20.0061, 'eval_samples_per_second': 57.732, 'eval_steps_per_second': 7.248, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

VAL [ar]: {'eval_loss': 2.048280954360962, 'eval_em': 0.3469879518072289, 'eval_f1_token': 0.41613956160231147, 'eval_runtime': 7.3426, 'eval_samples_per_second': 56.52, 'eval_steps_per_second': 7.082, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

VAL [ko]: {'eval_loss': 1.9591587781906128, 'eval_em': 0.3792134831460674, 'eval_f1_token': 0.4522927361598218, 'eval_runtime': 5.8349, 'eval_samples_per_second': 61.013, 'eval_steps_per_second': 7.712, 'epoch': 1.0}


Filter:   0%|          | 0/1155 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

VAL [te]: {'eval_loss': 2.527325391769409, 'eval_em': 0.3020833333333333, 'eval_f1_token': 0.3778042272565769, 'eval_runtime': 6.149, 'eval_samples_per_second': 62.449, 'eval_steps_per_second': 7.806, 'epoch': 1.0}
