# WEEK 37


In [1]:
import math
from collections import Counter
from typing import List, Dict, Iterable

import pandas as pd

# NLTK
from nltk.tokenize import wordpunct_tokenize
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import MLE, Laplace, KneserNeyInterpolated,  WittenBellInterpolated


In [None]:
## K = 3

#DOWNLOAD DATASET

splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
print(df_train.head())
print(df_val.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


                                            question  \
0  উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...   
1           দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?   
2  মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...   
3  আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...   
4          বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?   

                                             context lang  answerable  \
0  WikiLeaks () is an international non-profit or...   bn        True   
1  The war in Europe concluded with an invasion o...   bn        True   
2  Same-sex marriage in the United States expande...   bn       False   
3  The exact number of Arab casualties is unknown...   bn        True   
4  As Thomas Hall (2000) notes, "The Sung Empire ...   bn        True   

   answer_start        answer answer_inlang  
0           182          2006          None  
1            48       Germany          None  
2            -1            no          None  
3            39       unknown          N

In [None]:

# Which text to model (use 'question_text' for the 3 languages; switch to a context column for English)
QUESTION_FIELD = "question"

CONTEXT_FIELD = "context"

# Which column has language codes for that text
LANG_FIELD = "lang"  # for questions; change if you model contexts with a different column

# Which languages to run (Arabic, Korean, Telugu)
LANGUAGES = ["ar", "ko", "te", "en"]

# n-gram orders to train
N_ORDERS = [2, 3]  # bigram, trigram

# Keep tokens with frequency >= MIN_FREQ, map others to <UNK>
MIN_FREQ = 3

SMOOTHINGS= ["laplace", "kn", "wb"]


In [None]:
import re
def tokenize_column(series):
    out = []
    for x in series:
        if not isinstance(x, str):
            x = "" if x is None else str(x)
        x = x.lower()
        out.append(wordpunct_tokenize(x))
    return out


def build_vocab(tokenized: List[List[str]], min_freq: int = 1) -> Dict[str, int]:
    cnt = Counter(w for s in tokenized for w in s)
    vocab = {w: i for i,(w,c) in enumerate(cnt.items()) if c >= min_freq}
    vocab["<OOV>"] = len(vocab)
    return vocab

def apply_unk(tokenized: List[List[str]], vocab: Dict[str,int]) -> List[List[str]]:
    return [[w if w in vocab else "<OOV>" for w in s] for s in tokenized]


In [None]:
def train_lm(tokenized: List[List[str]], order: int = 2, smoothing: str = "laplace"):
    smoothing = (smoothing or "").lower()
    if smoothing in {"", "mle"}:
        model = MLE(order)
    elif smoothing in {"laplace", "addone", "add-one"}:
        model = Laplace(order)
    elif smoothing in {"kn", "kneserney", "kneser-ney"}:
        model = KneserNeyInterpolated(order)
    elif smoothing in {"wb"}:
        model = WittenBellInterpolated(order)
    else:
        raise ValueError(f"Unsupported smoothing: {smoothing}")

    train_data, vocab = padded_everygram_pipeline(order, tokenized)
    model.fit(train_data, vocab)
    return model


In [None]:
def corpus_perplexity(model, tokenized: List[List[str]], order: int) -> dict:
    total_logp = 0.0
    total_count = 0
    for sent in tokenized:
        padded = list(pad_both_ends(sent, n=order))  # add <s>, </s>
        for ng in ngrams(padded, order):
            ctx = ng[:-1]
            w   = ng[-1]
            p = model.score(w, ctx)
            if p <= 0.0:
                p = 1e-12  # guard
            total_logp += math.log(p)
            total_count += 1
    ce  = - total_logp / max(total_count, 1)   # cross-entropy
    ppl = math.exp(ce)                         # perplexity
    return {"cross_entropy": ce, "perplexity": ppl, "tokens_counted": total_count}


In [None]:
results = []

for lang in LANGUAGES:
    # pick the text column(s)
    if lang == "en":
        tr_series = df_train[CONTEXT_FIELD].dropna().astype(str)
        va_series = df_val[CONTEXT_FIELD].dropna().astype(str)
        # exclude KN for English
        smoothings_here = [s for s in SMOOTHINGS
                           if s.lower() not in {"kn"}]
    else:
        tr_series = df_train[df_train[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str)
        va_series = df_val[df_val[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str)
        # use all listed smoothings
        smoothings_here = SMOOTHINGS[:]

    print(f"\n▶ Language: {lang} | train={len(tr_series)} | valid={len(va_series)}")

    # tokenize
    tr_tok = tokenize_column(tr_series)
    va_tok = tokenize_column(va_series)

    # vocab + <UNK>
    vocab     = build_vocab(tr_tok, min_freq=MIN_FREQ)
    tr_tok_u  = apply_unk(tr_tok, vocab)
    va_tok_u  = apply_unk(va_tok, vocab)

    # train/eval for each n and smoothing
    for n in N_ORDERS:
        for s in smoothings_here:
            lm = train_lm(tr_tok_u, order=n, smoothing=s)
            metrics = corpus_perplexity(lm, va_tok_u, order=n)
            results.append({
                "language": lang,
                "order": n,
                "smoothing": s,
                **metrics
            })
            print(f"  n={n}  {s}: PPL={metrics['perplexity']:.2f}  CE={metrics['cross_entropy']:.4f}  tokens={metrics['tokens_counted']}")



▶ Language: ar | train=2558 | valid=415
  n=2  laplace: PPL=40.55  CE=3.7025  tokens=3588
  n=2  kn: PPL=15.76  CE=2.7572  tokens=3588
  n=2  wb: PPL=15.03  CE=2.7103  tokens=3588
  n=3  laplace: PPL=63.82  CE=4.1561  tokens=4003
  n=3  kn: PPL=14.39  CE=2.6663  tokens=4003
  n=3  wb: PPL=11.40  CE=2.4335  tokens=4003

▶ Language: ko | train=2422 | valid=356
  n=2  laplace: PPL=18.25  CE=2.9039  tokens=2462
  n=2  kn: PPL=7.60  CE=2.0275  tokens=2462
  n=2  wb: PPL=7.57  CE=2.0240  tokens=2462
  n=3  laplace: PPL=25.04  CE=3.2206  tokens=2818
  n=3  kn: PPL=6.22  CE=1.8280  tokens=2818
  n=3  wb: PPL=5.57  CE=1.7181  tokens=2818

▶ Language: te | train=1355 | valid=384
  n=2  laplace: PPL=21.19  CE=3.0533  tokens=12658
  n=2  kn: PPL=14.07  CE=2.6438  tokens=12658
  n=2  wb: PPL=14.11  CE=2.6467  tokens=12658
  n=3  laplace: PPL=28.37  CE=3.3454  tokens=13042
  n=3  kn: PPL=7.11  CE=1.9619  tokens=13042
  n=3  wb: PPL=6.70  CE=1.9023  tokens=13042

▶ Language: en | train=15343 | valid

In [None]:
res_df = pd.DataFrame(results).sort_values(["language","order"])
res_df


Unnamed: 0,language,order,smoothing,cross_entropy,perplexity,tokens_counted
0,ar,2,laplace,3.702481,40.547768,3588
1,ar,2,kn,2.757217,15.755938,3588
2,ar,2,wb,2.71026,15.033185,3588
3,ar,3,laplace,4.156133,63.824205,4003
4,ar,3,kn,2.666342,14.387244,4003
5,ar,3,wb,2.433498,11.39869,4003
18,en,2,laplace,7.404704,1643.697558,363290
19,en,2,wb,5.422244,226.386513,363290
20,en,3,laplace,9.063613,8635.294615,366301
21,en,3,wb,5.031582,153.175181,366301


### POTENTIAL NEURAL MODEL ???

In [None]:
# If needed (Colab/new env), uncomment:
# !pip install -q bpemb torch

import math, re, random
from typing import List, Dict, Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from bpemb import BPEmb


In [None]:
SEED      = 42
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"

# training knobs (start small, bump after it works)
EPOCHS    = 10
BATCH     = 128
SEQ_LEN   = 120
EMB_DIM   = 100      # must match bpemb dim
HIDDEN    = 256
LAYERS    = 2
DROPOUT   = 0.2
LR        = 2e-3
CLIP      = 1.0

# bpemb options
# Option A (per-language): ar/ko/te/en separately — best quality
BPE_PER_LANGUAGE = True
BP_VS   = 5000    # vocab size
BP_DIM  = EMB_DIM   # should equal EMB_DIM

# Option B (single multilingual model): use 'multi'
# BPE_PER_LANGUAGE = False
# BP_LANG = "multi"     # uncomment to try multilingual model

# light text normalization
LOWERCASE    = True
FOLD_NUMBERS = True

# throttle English contexts (they’re huge)
EN_MAX_ROWS_TRAIN    = 30000
EN_MAX_ROWS_VALID    = 5000
EN_MAX_TOK_PER_DOC   = 400      # subword pieces per doc (for speed)
FREEZE_EMBEDDINGS    = True     # False = fine-tune embeddings


In [None]:
def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    if LOWERCASE:
        s = s.lower()
    if FOLD_NUMBERS:
        s = re.sub(r"\d+", "<num>", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def get_series(lang):
    if lang == "en":
        tr = df_train[CONTEXT_FIELD].dropna().astype(str)
        va = df_val[CONTEXT_FIELD].dropna().astype(str)
        if len(tr) > EN_MAX_ROWS_TRAIN:
            tr = tr.sample(EN_MAX_ROWS_TRAIN, random_state=SEED)
        if len(va) > EN_MAX_ROWS_VALID:
            va = va.sample(EN_MAX_ROWS_VALID, random_state=SEED)
        tr = tr.apply(normalize_text)
        va = va.apply(normalize_text)
        return tr, va, EN_MAX_TOK_PER_DOC
    else:
        tr = df_train[df_train[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str).apply(normalize_text)
        va = df_val[df_val[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str).apply(normalize_text)
        return tr, va, None


In [None]:
def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    if LOWERCASE:
        s = s.lower()
    if FOLD_NUMBERS:
        s = re.sub(r"\d+", "<num>", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def get_series(lang):
    if lang == "en":
        tr = df_train[CONTEXT_FIELD].dropna().astype(str)
        va = df_val[CONTEXT_FIELD].dropna().astype(str)
        if len(tr) > EN_MAX_ROWS_TRAIN:
            tr = tr.sample(EN_MAX_ROWS_TRAIN, random_state=SEED)
        if len(va) > EN_MAX_ROWS_VALID:
            va = va.sample(EN_MAX_ROWS_VALID, random_state=SEED)
        tr = tr.apply(normalize_text)
        va = va.apply(normalize_text)
        return tr, va, EN_MAX_TOK_PER_DOC
    else:
        tr = df_train[df_train[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str).apply(normalize_text)
        va = df_val[df_val[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str).apply(normalize_text)
        return tr, va, None


In [None]:
_bp_cache = {}

def load_bpemb_for_lang(lang: str):
    if BPE_PER_LANGUAGE:
        key = f"{lang}:{BP_VS}:{BP_DIM}"
        if key not in _bp_cache:
            _bp_cache[key] = BPEmb(lang=lang, vs=BP_VS, dim=BP_DIM)
        return _bp_cache[key]
    else:
        key = f"multi:{BP_VS}:{BP_DIM}"
        if key not in _bp_cache:
            _bp_cache[key] = BPEmb(lang="multi", vs=BP_VS, dim=BP_DIM)
        return _bp_cache[key]

def build_embedding_matrix(bp: BPEmb):
    # bp.emb.vectors shape: (vs, dim)
    pad_vec = np.zeros((1, BP_DIM), dtype=np.float32)
    eos_vec = np.zeros((1, BP_DIM), dtype=np.float32)  # simple; could be mean vector
    mat = np.concatenate([bp.emb.vectors, pad_vec, eos_vec], axis=0).astype(np.float32)
    pad_id = bp.emb.vectors.shape[0]        # PAD index
    eos_id = bp.emb.vectors.shape[0] + 1    # EOS index
    return torch.tensor(mat), pad_id, eos_id


In [None]:
def encode_docs_to_stream(bp: BPEmb, docs: List[str], eos_id: int, max_len_per_doc: int = None):
    ids = []
    for d in docs:
        wp = bp.encode_ids(d)  # list of subword ids (0..vs-1)
        if max_len_per_doc:    # throttle long docs (esp. English)
            wp = wp[:max_len_per_doc]
        ids.extend(wp + [eos_id])
    return np.array(ids, dtype=np.int64)

class WPStreamDataset(Dataset):
    def __init__(self, id_stream: np.ndarray, seq_len: int):
        self.data = torch.tensor(id_stream, dtype=torch.long)
        self.seq_len = seq_len
        self.num_seq = max(0, (len(self.data) - 1) // seq_len)

    def __len__(self): return self.num_seq
    def __getitem__(self, i):
        s = i * self.seq_len
        x = self.data[s : s+self.seq_len]
        y = self.data[s+1 : s+1+self.seq_len]
        return x, y

def make_loaders(id_train: np.ndarray, id_valid: np.ndarray, seq_len: int, batch: int):
    tr_ds = WPStreamDataset(id_train, seq_len)
    va_ds = WPStreamDataset(id_valid, seq_len)
    tr_dl = DataLoader(tr_ds, batch_size=batch, shuffle=True,  drop_last=True)
    va_dl = DataLoader(va_ds, batch_size=batch, shuffle=False, drop_last=False)
    return tr_dl, va_dl


In [None]:
def encode_docs_to_stream(bp: BPEmb, docs: List[str], eos_id: int, max_len_per_doc: int = None):
    ids = []
    for d in docs:
        wp = bp.encode_ids(d)  # list of subword ids (0..vs-1)
        if max_len_per_doc:    # throttle long docs (esp. English)
            wp = wp[:max_len_per_doc]
        ids.extend(wp + [eos_id])
    return np.array(ids, dtype=np.int64)

class WPStreamDataset(Dataset):
    def __init__(self, id_stream: np.ndarray, seq_len: int):
        self.data = torch.tensor(id_stream, dtype=torch.long)
        self.seq_len = seq_len
        self.num_seq = max(0, (len(self.data) - 1) // seq_len)

    def __len__(self): return self.num_seq
    def __getitem__(self, i):
        s = i * self.seq_len
        x = self.data[s : s+self.seq_len]
        y = self.data[s+1 : s+1+self.seq_len]
        return x, y

def make_loaders(id_train: np.ndarray, id_valid: np.ndarray, seq_len: int, batch: int):
    tr_ds = WPStreamDataset(id_train, seq_len)
    va_ds = WPStreamDataset(id_valid, seq_len)
    tr_dl = DataLoader(tr_ds, batch_size=batch, shuffle=True,  drop_last=True)
    va_dl = DataLoader(va_ds, batch_size=batch, shuffle=False, drop_last=False)
    return tr_dl, va_dl


In [None]:
class WPLSTMLM(nn.Module):
    def __init__(self, emb_matrix: torch.Tensor, pad_id: int, hidden=256, layers=2, dropout=0.2, freeze=True):
        super().__init__()
        V, D = emb_matrix.shape
        self.embed = nn.Embedding.from_pretrained(emb_matrix, freeze=freeze, padding_idx=pad_id)
        self.lstm  = nn.LSTM(D, hidden, num_layers=layers, dropout=dropout, batch_first=True)
        self.drop  = nn.Dropout(dropout)
        self.head  = nn.Linear(hidden, V)

    def forward(self, x, h=None):
        x = self.embed(x)
        x, h = self.lstm(x, h)
        x = self.drop(x)
        logits = self.head(x)
        return logits, h


In [None]:
def set_seed(s=SEED):
    random.seed(s); np.random.seed(s); torch.manual_seed(s); torch.cuda.manual_seed_all(s)

@torch.no_grad()
def evaluate(model, loader):
    model.eval()
    total_loss, total_tok = 0.0, 0
    crit = nn.CrossEntropyLoss(reduction="sum")  # sum, we divide later
    for x, y in loader:
        x = x.to(DEVICE); y = y.to(DEVICE)
        logits, _ = model(x)
        loss = crit(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
        total_loss += loss.item()
        total_tok  += y.numel()
    ce  = total_loss / max(total_tok, 1)   # natural log
    ppl = math.exp(ce)
    return ce, ppl, total_tok

def train_bpemb_lm(lang, train_series, valid_series, max_tok_per_doc=None):
    set_seed()
    bp   = load_bpemb_for_lang(lang if BPE_PER_LANGUAGE else "multi")
    embM, pad_id, eos_id = build_embedding_matrix(bp)

    ids_tr = encode_docs_to_stream(bp, list(train_series), eos_id, max_tok_per_doc)
    ids_va = encode_docs_to_stream(bp, list(valid_series), eos_id, max_tok_per_doc)

    tr_dl, va_dl = make_loaders(ids_tr, ids_va, SEQ_LEN, BATCH)

    model = WPLSTMLM(embM, pad_id, hidden=HIDDEN, layers=LAYERS, dropout=DROPOUT, freeze=FREEZE_EMBEDDINGS).to(DEVICE)
    opt   = torch.optim.AdamW(model.parameters(), lr=LR)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)

    print(f"[{lang}] subword_vocab={embM.size(0)} train_batches={len(tr_dl)} valid_batches={len(va_dl)} device={DEVICE}")

    best = float("inf")
    for ep in range(1, EPOCHS+1):
        model.train()
        total, steps = 0.0, 0
        crit = nn.CrossEntropyLoss()
        for x, y in tr_dl:
            x = x.to(DEVICE); y = y.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            logits, _ = model(x)
            loss = crit(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            opt.step()
            total += loss.item(); steps += 1
        sched.step()
        ce, ppl, tok = evaluate(model, va_dl)
        best = min(best, ppl)
        print(f"  epoch {ep:02d}  train_ce={total/max(steps,1):.4f}  val_ce={ce:.4f}  val_ppl={ppl:.2f}  tokens={tok}")
    return best


In [None]:
bpemb_results = []
for lang in LANGUAGES:
    tr, va, max_len = get_series(lang)
    print(f"\n▶ BPEmb LSTM on {lang}: train_rows={len(tr)} valid_rows={len(va)}")
    best_ppl = train_bpemb_lm(lang if lang!="en" else "en", tr, va, max_tok_per_doc=max_len)
    bpemb_results.append({"language": lang, "model": "bpemb_lstm", "best_ppl": best_ppl})

bpemb_results



▶ BPEmb LSTM on ar: train_rows=2558 valid_rows=415
[ar] subword_vocab=25002 train_batches=3 valid_batches=1 device=cuda
  epoch 01  train_ce=10.0721  val_ce=9.7612  val_ppl=17347.82  tokens=4080
  epoch 02  train_ce=8.9951  val_ce=7.9363  val_ppl=2796.89  tokens=4080
  epoch 03  train_ce=7.4374  val_ce=7.3163  val_ppl=1504.69  tokens=4080
  epoch 04  train_ce=6.8752  val_ce=7.2103  val_ppl=1353.33  tokens=4080
  epoch 05  train_ce=6.7198  val_ce=7.2457  val_ppl=1402.10  tokens=4080
  epoch 06  train_ce=6.7010  val_ce=7.2711  val_ppl=1438.09  tokens=4080
  epoch 07  train_ce=6.6713  val_ce=7.2776  val_ppl=1447.47  tokens=4080
  epoch 08  train_ce=6.6410  val_ce=7.2752  val_ppl=1444.05  tokens=4080
  epoch 09  train_ce=6.6239  val_ce=7.2728  val_ppl=1440.52  tokens=4080
  epoch 10  train_ce=6.6241  val_ce=7.2719  val_ppl=1439.28  tokens=4080

▶ BPEmb LSTM on ko: train_rows=2422 valid_rows=356
[ko] subword_vocab=25002 train_batches=3 valid_batches=1 device=cuda
  epoch 01  train_ce=10.05

[{'language': 'ar', 'model': 'bpemb_lstm', 'best_ppl': 1353.3297535724187},
 {'language': 'ko', 'model': 'bpemb_lstm', 'best_ppl': 625.1960137316046},
 {'language': 'te', 'model': 'bpemb_lstm', 'best_ppl': 2254.6223761306815},
 {'language': 'en', 'model': 'bpemb_lstm', 'best_ppl': 216.63805851503267}]