# WEEK 37


In [1]:
import math
from collections import Counter
from typing import List, Dict, Iterable

import pandas as pd

# NLTK
from nltk.tokenize import wordpunct_tokenize
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import MLE, Laplace, KneserNeyInterpolated,  WittenBellInterpolated


In [None]:
# K = 3  (NOTE(review): the meaning of this marker is not evident from the notebook — confirm)

# Download the train and validation splits of coastalcph/tydi_xor_rc from the
# Hugging Face Hub (parquet files read through the hf:// filesystem).
splits = dict(train="train.parquet", validation="validation.parquet")
df_train, df_val = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits[part])
    for part in ("train", "validation")
)

# Plain-text preview of the first rows of each split.
print(df_train.head(), df_val.head(), sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


                                            question  \
0  উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...   
1           দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?   
2  মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...   
3  আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...   
4          বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?   

                                             context lang  answerable  \
0  WikiLeaks () is an international non-profit or...   bn        True   
1  The war in Europe concluded with an invasion o...   bn        True   
2  Same-sex marriage in the United States expande...   bn       False   
3  The exact number of Arab casualties is unknown...   bn        True   
4  As Thomas Hall (2000) notes, "The Sung Empire ...   bn        True   

   answer_start        answer answer_inlang  
0           182          2006          None  
1            48       Germany          None  
2            -1            no          None  
3            39       unknown          N

In [None]:

# --- Dataframe column names -------------------------------------------------
QUESTION_FIELD = "question"
CONTEXT_FIELD = "context"
LANG_FIELD = "lang"

# --- Experiment grid ---------------------------------------------------------
# Language codes to model. The training loop treats "en" specially: it reads
# the context column instead of filtering questions by language.
LANGUAGES = ["ar", "ko", "te", "en"]

# n-gram orders to train (bigram and trigram models).
N_ORDERS = [2, 3]

# Minimum training-set count for a token to be kept in the vocabulary;
# rarer tokens are replaced by the "<OOV>" placeholder.
MIN_FREQ = 3

# Smoothing keys understood by train_lm: Laplace, Kneser-Ney, Witten-Bell.
SMOOTHINGS = ["laplace", "kn", "wb"]


In [None]:
# Dependencies and setup for the per-language tokenizers defined below.
import re
import nltk
# Download the NLTK data packages 'punkt' and 'punkt_tab'.
nltk.download('punkt')
nltk.download('punkt_tab')  # NOTE(review): the original note here was truncated ("ne"); presumably required by newer NLTK releases — confirm
from collections import Counter
from typing import List, Dict

from camel_tools.tokenizers.word import simple_word_tokenize      # Arabic
from konlpy.tag import Okt                                        # Korean
from indicnlp.tokenize import indic_tokenize                      # Telugu

# Single shared Okt instance; get_tokenizer returns its `morphs` method for "ko".
okt = Okt()

def get_tokenizer(lang: str):
    """Return the tokenizer callable used for the language code ``lang``.

    Mapping:
      * ``"ar"`` -> ``simple_word_tokenize``
      * ``"ko"`` -> ``okt.morphs``
      * ``"te"`` -> a wrapper that turns ``indic_tokenize.trivial_tokenize``
        output into a list
      * ``"en"`` and every other code -> ``nltk.word_tokenize``

    The returned object takes one text string and yields its tokens.
    """
    if lang == "ar":
        return simple_word_tokenize
    if lang == "ko":
        return okt.morphs
    if lang == "te":
        def telugu_tokenize(text):
            # Materialise the tokenizer output so callers always get a list.
            return list(indic_tokenize.trivial_tokenize(text))
        return telugu_tokenize
    # English and any unrecognised code share the NLTK word tokenizer.
    return nltk.word_tokenize

def tokenize_column(series, lang: str):
    """Lower-case and tokenize every entry of ``series`` for language ``lang``.

    Args:
        series: iterable of texts. Non-string entries are coerced first:
            ``None`` becomes ``""``, anything else goes through ``str()``.
        lang: language code handed to ``get_tokenizer``.

    Returns:
        A list with one tokenizer result per entry, in input order.
    """
    tokenize = get_tokenizer(lang)

    def as_lowered_text(value):
        # Coerce to str before lower-casing, exactly once per entry.
        if isinstance(value, str):
            return value.lower()
        return ("" if value is None else str(value)).lower()

    return [tokenize(as_lowered_text(entry)) for entry in series]

def build_vocab(tokenized: List[List[str]], min_freq: int = 1) -> Dict[str, int]:
    """Build a token -> index vocabulary from tokenized sentences.

    Tokens whose corpus count is at least ``min_freq`` are kept, in order of
    first appearance, and numbered contiguously from 0. The ``"<OOV>"``
    placeholder is then added with the next free index, unless the corpus
    itself already contributed a kept ``"<OOV>"`` token, in which case its
    existing index is retained.

    Args:
        tokenized: list of sentences, each a list of token strings.
        min_freq: minimum count for a token to be kept.

    Returns:
        Dict mapping each kept token (plus ``"<OOV>"``) to a unique integer
        in ``range(len(vocab))``.
    """
    counts = Counter(token for sentence in tokenized for token in sentence)
    # Filter first, then number: enumerating before the filter leaves gaps in
    # the ids and lets "<OOV>" (= len(vocab)) collide with a real token's id.
    kept = (token for token, count in counts.items() if count >= min_freq)
    vocab = {token: index for index, token in enumerate(kept)}
    # setdefault keeps ids contiguous even if "<OOV>" is already a kept token.
    vocab.setdefault("<OOV>", len(vocab))
    return vocab

def apply_oov(tokenized: List[List[str]], vocab: Dict[str, int]) -> List[List[str]]:
    """Replace every token that is not a key of ``vocab`` with ``"<OOV>"``.

    Returns new lists, one per input sentence, with the same lengths and
    order; the input is not modified.
    """
    mapped = []
    for sentence in tokenized:
        mapped.append([token if token in vocab else "<OOV>" for token in sentence])
    return mapped


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def train_lm(tokenized: List[List[str]], order: int = 2, smoothing: str = "laplace"):
    """Fit an NLTK n-gram language model on tokenized sentences.

    Args:
        tokenized: training sentences, each a list of token strings.
        order: n-gram order of the model.
        smoothing: case-insensitive key selecting the model class:
            ``""``/``None``/``"mle"`` -> ``MLE``;
            ``"laplace"``/``"addone"``/``"add-one"`` -> ``Laplace``;
            ``"kn"``/``"kneserney"``/``"kneser-ney"`` -> ``KneserNeyInterpolated``;
            ``"wb"`` -> ``WittenBellInterpolated``.

    Returns:
        The fitted model.

    Raises:
        ValueError: if ``smoothing`` is not one of the keys above.
    """
    smoothing = (smoothing or "").lower()

    # Alias sets paired with lazy constructors, so only the chosen class is touched.
    builders = (
        ({"", "mle"}, lambda: MLE(order)),
        ({"laplace", "addone", "add-one"}, lambda: Laplace(order)),
        ({"kn", "kneserney", "kneser-ney"}, lambda: KneserNeyInterpolated(order)),
        ({"wb"}, lambda: WittenBellInterpolated(order)),
    )
    for aliases, build in builders:
        if smoothing in aliases:
            model = build()
            break
    else:
        raise ValueError(f"Unsupported smoothing: {smoothing}")

    # Padded everygrams up to `order`, plus the flat word stream for the vocabulary.
    train_data, vocab = padded_everygram_pipeline(order, tokenized)
    model.fit(train_data, vocab)
    return model


In [None]:
def corpus_perplexity(model, tokenized: List[List[str]], order: int) -> dict:
    """Score a tokenized corpus with ``model`` and return its perplexity.

    Each sentence is padded with ``pad_both_ends(..., n=order)`` and every
    n-gram of length ``order`` is scored as ``model.score(last_token, context)``.

    Args:
        model: object exposing ``score(word, context)``.
        tokenized: sentences to evaluate, each a list of token strings.
        order: n-gram order used for padding and n-gram extraction.

    Returns:
        Dict with ``cross_entropy`` (mean negative natural-log probability),
        ``perplexity`` (``exp`` of the cross-entropy) and ``tokens_counted``
        (number of n-grams scored, including those ending in padding symbols).
        An empty corpus gives cross-entropy 0 and perplexity 1.

    NOTE(review): the number of scored events depends on ``order`` because of
    the padding, so perplexities for different orders are presumably not
    averaged over the same events — confirm before comparing across orders.
    """
    log_likelihood = 0.0
    n_events = 0

    for sentence in tokenized:
        with_boundaries = list(pad_both_ends(sentence, n=order))  # adds <s>, </s>
        for gram in ngrams(with_boundaries, order):
            history, target = gram[:-1], gram[-1]
            prob = model.score(target, history)
            # Clamp non-positive probabilities so the logarithm stays finite.
            prob = 1e-12 if prob <= 0.0 else prob
            log_likelihood += math.log(prob)
            n_events += 1

    cross_entropy = -log_likelihood / max(n_events, 1)
    return dict(
        cross_entropy=cross_entropy,
        perplexity=math.exp(cross_entropy),
        tokens_counted=n_events,
    )


In [None]:
# Train and evaluate one n-gram LM per (language, order, smoothing) combination;
# each run appends one record to `results`.
results = []

for lang in LANGUAGES:
    if lang != "en":
        # Other languages: model the questions written in that language,
        # with every configured smoothing.
        tr_series = df_train.loc[df_train[LANG_FIELD] == lang, QUESTION_FIELD].dropna().astype(str)
        va_series = df_val.loc[df_val[LANG_FIELD] == lang, QUESTION_FIELD].dropna().astype(str)
        smoothings_here = SMOOTHINGS[:]
    else:
        # English: model the context column of all rows, and skip Kneser-Ney.
        tr_series = df_train[CONTEXT_FIELD].dropna().astype(str)
        va_series = df_val[CONTEXT_FIELD].dropna().astype(str)
        smoothings_here = [s for s in SMOOTHINGS if s.lower() != "kn"]

    print(f"\n▶ Language: {lang} | train={len(tr_series)} | valid={len(va_series)}")

    # Tokenize both splits with the language-specific tokenizer.
    tr_tok = tokenize_column(tr_series, lang)
    va_tok = tokenize_column(va_series, lang)

    # Vocabulary comes from the training split only; rare and unseen tokens become "<OOV>".
    vocab = build_vocab(tr_tok, min_freq=MIN_FREQ)
    tr_tok_u = apply_oov(tr_tok, vocab)
    va_tok_u = apply_oov(va_tok, vocab)

    # Fit on train, score on validation, for every order/smoothing pair.
    for n in N_ORDERS:
        for s in smoothings_here:
            lm = train_lm(tr_tok_u, order=n, smoothing=s)
            metrics = corpus_perplexity(lm, va_tok_u, order=n)
            results.append(dict(
                language=lang,
                vocab_size=len(vocab),
                order=n,
                smoothing=s,
                **metrics,
            ))
            print(f"  n={n}  {s}: PPL={metrics['perplexity']:.2f}  CE={metrics['cross_entropy']:.4f}  tokens={metrics['tokens_counted']}")



▶ Language: ar | train=2558 | valid=415
  n=2  laplace: PPL=34.56  CE=3.5428  tokens=3457
  n=2  kn: PPL=13.23  CE=2.5825  tokens=3457
  n=2  wb: PPL=12.83  CE=2.5519  tokens=3457
  n=3  laplace: PPL=54.72  CE=4.0023  tokens=3872
  n=3  kn: PPL=12.03  CE=2.4878  tokens=3872
  n=3  wb: PPL=9.78  CE=2.2801  tokens=3872

▶ Language: ko | train=2422 | valid=356
  n=2  laplace: PPL=30.41  CE=3.4148  tokens=3608
  n=2  kn: PPL=11.89  CE=2.4756  tokens=3608
  n=2  wb: PPL=11.51  CE=2.4429  tokens=3608
  n=3  laplace: PPL=49.59  CE=3.9037  tokens=3964
  n=3  kn: PPL=9.71  CE=2.2728  tokens=3964
  n=3  wb: PPL=8.16  CE=2.0992  tokens=3964

▶ Language: te | train=1355 | valid=384
  n=2  laplace: PPL=30.61  CE=3.4214  tokens=3074
  n=2  kn: PPL=10.99  CE=2.3966  tokens=3074
  n=2  wb: PPL=10.89  CE=2.3879  tokens=3074
  n=3  laplace: PPL=43.59  CE=3.7747  tokens=3458
  n=3  kn: PPL=8.54  CE=2.1443  tokens=3458
  n=3  wb: PPL=7.28  CE=1.9854  tokens=3458

▶ Language: en | train=15343 | valid=3011

In [None]:
# Collect all runs into one table, ordered by language and then by n-gram order.
res_df = pd.DataFrame(results).sort_values(by=["language", "order"])
res_df


Unnamed: 0,language,vocab_size,order,smoothing,cross_entropy,perplexity,tokens_counted
0,ar,965,2,laplace,3.542808,34.563823,3457
1,ar,965,2,kn,2.582526,13.230522,3457
2,ar,965,2,wb,2.55188,12.831205,3457
3,ar,965,3,laplace,4.00228,54.722794,3872
4,ar,965,3,kn,2.48778,12.034535,3872
5,ar,965,3,wb,2.280074,9.777403,3872
18,en,29119,2,laplace,7.319152,1508.923553,353455
19,en,29119,2,wb,5.346028,209.773333,353455
20,en,29119,3,laplace,8.995849,8069.521425,356466
21,en,29119,3,wb,4.988482,146.713584,356466


### Potential neural model (exploratory): XLM-R pseudo-perplexity

In [8]:
# !pip install -q pandas fsspec huggingface_hub transformers torch --upgrade
import math, numpy as np, pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# --- Load data ---
splits = dict(train="train.parquet", validation="validation.parquet")
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])

langs = ["ar", "ko", "te"]
# Validation questions grouped by language code.
val_questions = {
    code: df_val.loc[df_val["lang"] == code, "question"].astype(str).tolist()
    for code in langs
}
# Context column of every validation row.
val_contexts_en = df_val["context"].astype(str).tolist()

# --- Load model (the warning about unused pooler weights is expected / fine) ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
name = "xlm-roberta-base"
tok = AutoTokenizer.from_pretrained(name)
mdl = AutoModelForMaskedLM.from_pretrained(name)
mdl = mdl.to(device)
mdl.eval()
torch.backends.cuda.matmul.allow_tf32 = True

@torch.inference_mode()
def pll_pseudo_perplexity_batched(texts, max_len=128, max_texts=100, pos_chunk=32):
    """Mean per-text pseudo-perplexity under the module-level masked LM.

    For each text, every token position except the first and last is masked
    in turn, and the log-probability of the original token at that position
    is read from the model. A text's pseudo-perplexity is
    ``exp(-mean log-prob)``; the function returns the mean of these values.

    Uses the module-level ``tok`` (tokenizer), ``mdl`` (model) and ``device``.

    Args:
        texts: iterable of candidate texts; non-strings and blank strings are dropped.
        max_len: ``max_length`` passed to the tokenizer (with truncation).
        max_texts: keep only the first ``max_texts`` usable texts; ``None`` keeps all.
        pos_chunk: number of masked copies sent through the model per forward pass.

    Returns:
        Mean pseudo-perplexity as a float, or NaN when no text was scored.
    """
    usable = [t for t in texts if isinstance(t, str) and t.strip()]
    if max_texts is not None:
        usable = usable[:max_texts]

    per_text_ppl = []

    for text in usable:
        encoded = tok(text, return_tensors="pt", truncation=True, max_length=max_len)
        ids = encoded.input_ids.to(device)
        if ids.size(1) <= 2:
            # Nothing left to score between the first and last token.
            continue
        seq = ids[0]
        inner_positions = list(range(1, seq.size(0) - 1))  # skip <s> and </s>
        total_logprob = 0.0
        n_scored = 0

        # Work through the positions in chunks.
        for start in range(0, len(inner_positions), pos_chunk):
            window = inner_positions[start:start + pos_chunk]
            # One copy of the sequence per position, each with *one* position masked.
            batch = seq.unsqueeze(0).repeat(len(window), 1)
            for row, pos in enumerate(window):
                batch[row, pos] = tok.mask_token_id

            logits = mdl(input_ids=batch).logits  # [B, T, V]
            # Read the log-probability of the original token at each masked index.
            for row, pos in enumerate(window):
                target_id = int(seq[pos].item())
                total_logprob += torch.log_softmax(logits[row, pos], dim=-1)[target_id].item()
                n_scored += 1

            # Release memory between chunks (important on Colab).
            del batch, logits
            if device == "cuda":
                torch.cuda.empty_cache()

        if n_scored > 0:
            per_text_ppl.append(math.exp(-total_logprob / n_scored))

    if not per_text_ppl:
        return float("nan")
    return float(np.mean(per_text_ppl))

# --- Evaluation (fast & stable) ---
# NOTE(review): this rebinds `results`, which the n-gram cells above also use
# for their own records.
results = []
for L in langs:
    ppl = pll_pseudo_perplexity_batched(val_questions[L], max_len=128, max_texts=100, pos_chunk=32)
    results.append({
        "model": "XLM-R (PLL)",
        "domain": f"questions-{L}",
        "perplexity": round(ppl, 2),
    })

# Contexts are long, so be more conservative: fewer texts and smaller chunks.
ppl_ctx = pll_pseudo_perplexity_batched(val_contexts_en, max_len=128, max_texts=50, pos_chunk=16)
results.append({
    "model": "XLM-R (PLL)",
    "domain": "contexts-en",
    "perplexity": round(ppl_ctx, 2),
})

print(pd.DataFrame(results))


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


         model        domain  perplexity
0  XLM-R (PLL)  questions-ar      195.47
1  XLM-R (PLL)  questions-ko       10.56
2  XLM-R (PLL)  questions-te        8.37
3  XLM-R (PLL)   contexts-en        3.47
