# WEEK 37


In [1]:
# Install the tokenizer libraries imported in the next cell:
# camel-tools (Arabic), konlpy (Korean) and indic-nlp-library (Telugu).
!pip install -q camel-tools konlpy indic-nlp-library
# Install a JDK without prompting; presumably needed by konlpy's Okt tagger,
# which runs on the JVM — TODO confirm.
!apt -y install default-jdk

# If the imports in the next cell fail with
#   "ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject"
# uncomment and run the following line to force-reinstall the packages:
## !pip install --upgrade --force-reinstall konlpy camel-tools indic-nlp-library



[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.3.3 which is incompatible.
cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
dask-cudf-cu12 25.6.0 requires pandas<2.2.4dev0,>=2.0, but you have pandas 2.3.3 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 

In [1]:
import math, pandas as pd, nltk
from collections import Counter
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import Laplace, KneserNeyInterpolated, WittenBellInterpolated
from nltk.tokenize import wordpunct_tokenize
from camel_tools.tokenizers.word import simple_word_tokenize # Arabic
from konlpy.tag import Okt # Korean
from indicnlp.tokenize import indic_tokenize # Telugu

import nltk
# Fetch the NLTK tokenizer resources requested by this notebook.
for resource_name in ('punkt', 'punkt_tab'):
    nltk.download(resource_name)

# Shared Okt instance; its `morphs` method is used as the Korean tokenizer.
okt = Okt()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
# Parquet file name for each split of the coastalcph/tydi_xor_rc dataset.
splits = {
    'train': 'train.parquet',
    'validation': 'validation.parquet',
}
df_train = pd.read_parquet(f"hf://datasets/coastalcph/tydi_xor_rc/{splits['train']}")
df_val = pd.read_parquet(f"hf://datasets/coastalcph/tydi_xor_rc/{splits['validation']}")

# Column names used to select text and filter by language.
QUESTION = "question"
CONTEXT = "context"
LANG = "lang"

# Experiment grid: language codes, n-gram orders and smoothing methods.
LANGUAGES = ["ar", "ko", "te", "en"]
Ns = [2, 3]
SMOOTHINGS = ["laplace", "kn", "wb"]

# tokenizers per language
def select_tokenizer(lang):
    """Return the tokenizer callable for a language code.

    Supported codes:
        "ar" -> camel_tools ``simple_word_tokenize`` (Arabic)
        "ko" -> ``okt.morphs`` from konlpy (Korean)
        "te" -> indicnlp ``trivial_tokenize`` wrapped to return a list (Telugu)
        "en" -> ``nltk.word_tokenize``

    Args:
        lang: Language code string.

    Returns:
        A callable that takes a string and returns its tokens.

    Raises:
        ValueError: If ``lang`` is not one of the supported codes. Without
            this, the function would return ``None`` and the caller would
            fail later with an unrelated "NoneType is not callable" error.
    """
    if lang == "ar":
        return simple_word_tokenize
    if lang == "ko":
        return okt.morphs
    if lang == "te":
        # Materialise the result so every tokenizer hands back a list.
        return lambda s: list(indic_tokenize.trivial_tokenize(s))
    if lang == "en":
        return nltk.word_tokenize
    raise ValueError(f"unsupported language: {lang!r}")

def tokenize_column(series, lang):
    """Tokenize every non-null entry of a text column.

    Null entries are dropped, each remaining value is converted to ``str``
    and lower-cased, then split with the tokenizer chosen for ``lang``.

    Args:
        series: Column of texts (must provide ``dropna()``).
        lang: Language code passed to ``select_tokenizer``.

    Returns:
        A list with one token sequence per non-null entry.
    """
    tokenizer = select_tokenizer(lang)
    lowered_texts = (str(value).lower() for value in series.dropna())
    return list(map(tokenizer, lowered_texts))

def build_vocab(tokenized, min_freq):
    """Build a word -> id mapping from tokenized sentences.

    Only words occurring at least ``min_freq`` times are kept. Ids are
    assigned contiguously (0, 1, 2, ...) in order of first occurrence, and
    the special ``"<OOV>"`` entry receives the next free id.

    Ids are numbered after the frequency filter. Numbering before it leaves
    gaps and lets the ``"<OOV>"`` id collide with a kept word's id.

    Args:
        tokenized: Iterable of token sequences.
        min_freq: Minimum count a word needs to enter the vocabulary.

    Returns:
        dict mapping each kept word (plus ``"<OOV>"``) to a unique integer id.
    """
    counter = Counter(w for s in tokenized for w in s)
    frequent_words = (w for w, c in counter.items() if c >= min_freq)
    vocab = {w: i for i, w in enumerate(frequent_words)}
    # setdefault keeps ids contiguous even if "<OOV>" is itself a frequent token.
    vocab.setdefault("<OOV>", len(vocab))

    return vocab

def apply_oov(tokenized, vocab):
    """Replace every token that is not in ``vocab`` with ``"<OOV>"``.

    Args:
        tokenized: Iterable of token sequences.
        vocab: Container supporting ``in`` for known tokens.

    Returns:
        A new list of lists with the same shape as the input.
    """
    mapped_sentences = []
    for sentence in tokenized:
        mapped = [token if token in vocab else "<OOV>" for token in sentence]
        mapped_sentences.append(mapped)
    return mapped_sentences

def train_lm(tokenized, n, smoothing):
    """Fit an NLTK n-gram language model of order ``n``.

    Args:
        tokenized: Tokenized training sentences.
        n: Order of the model.
        smoothing: ``"laplace"``, ``"kn"`` (KneserNeyInterpolated) or
            ``"wb"`` (WittenBellInterpolated).

    Returns:
        The fitted model.

    Raises:
        ValueError: If ``smoothing`` is not one of the names above.
    """
    ngram_stream, padded_words = padded_everygram_pipeline(n, tokenized)

    # Smoothing name -> model class, checked in this order.
    estimators = (
        ("laplace", Laplace),
        ("kn", KneserNeyInterpolated),
        ("wb", WittenBellInterpolated),
    )
    for key, estimator_cls in estimators:
        if smoothing == key:
            lm = estimator_cls(n)
            break
    else:
        raise ValueError(smoothing)

    lm.fit(ngram_stream, padded_words)
    return lm

def perplexity(model, tokenized_sentences, n):
    """Score sentences with ``model`` and return its perplexity.

    Each sentence is padded on both ends and split into n-grams of order
    ``n``. The last item of every n-gram is scored given the preceding
    items. A falsy (zero) probability is replaced by ``1e-12`` so the
    logarithm is defined.

    Args:
        model: Object exposing ``score(word, context)``.
        tokenized_sentences: Iterable of token sequences.
        n: N-gram order used for padding and scoring.

    Returns:
        Tuple ``(perplexity, cross_entropy, n_scored)``. ``cross_entropy``
        is the mean negative natural-log probability, ``perplexity`` is its
        exponential, and ``n_scored`` is the number of n-grams scored.
    """
    total_log_prob = 0.0
    n_scored = 0
    for tokens in tokenized_sentences:
        padded = list(pad_both_ends(tokens, n=n))
        for gram in ngrams(padded, n):
            history = gram[:-1]
            target = gram[-1]
            p = model.score(target, history)
            if not p:
                p = 1e-12
            total_log_prob += math.log(p)
            n_scored += 1

    # Divide by at least 1 so empty input does not raise ZeroDivisionError.
    cross_entropy = - total_log_prob / max(n_scored, 1)
    return math.exp(cross_entropy), cross_entropy, n_scored

# train and evaluate
# For every language, train one model per (n-gram order, smoothing) pair on the
# training split and record its perplexity on the validation split.
rows = []
for L in LANGUAGES:
    if L == "en":
        # English uses the context column of all rows (no language filter).
        tr_txt = df_train[CONTEXT]
        val_txt = df_val[CONTEXT]
        smooths = [s for s in SMOOTHINGS if s != "kn"]  # leave out KN for English (computational issue)
    else:
        # Other languages use the question column of the rows in that language.
        tr_txt = df_train.loc[df_train[LANG]==L, QUESTION]
        val_txt = df_val.loc[df_val[LANG]==L, QUESTION]
        smooths = SMOOTHINGS

    tr_tok = tokenize_column(tr_txt, L)
    val_tok = tokenize_column(val_txt, L)
    # The vocabulary comes from the training split only (min_freq = 3);
    # tokens outside it become "<OOV>" in both splits.
    vocab = build_vocab(tr_tok, 3)
    tr_tok = apply_oov(tr_tok, vocab)
    val_tok = apply_oov(val_tok, vocab)

    for n in Ns:
        for s in smooths:
            model = train_lm(tr_tok, n, s)
            # num_tokens is the third value returned by perplexity(), i.e. the
            # number of n-grams scored on the validation split.
            ppl, cross_entropy, num_tokens = perplexity(model, val_tok, n)
            rows.append(dict(language=L, order=n, smoothing=s, vocab_size=len(vocab), tokens=num_tokens, perplexity=round(ppl, 2), cross_entropy=round(cross_entropy, 4)))
            print(f"{L} n={n} {s}: PPL={ppl:.2f} CE={cross_entropy:.4f}")

# One row per (language, order, smoothing), sorted for easier comparison.
res_df = pd.DataFrame(rows).sort_values(["language","order","smoothing"])
print(res_df)


ar n=2 laplace: PPL=34.56 CE=3.5428
ar n=2 kn: PPL=13.23 CE=2.5825
ar n=2 wb: PPL=12.83 CE=2.5519
ar n=3 laplace: PPL=54.72 CE=4.0023
ar n=3 kn: PPL=12.03 CE=2.4878
ar n=3 wb: PPL=9.78 CE=2.2801
ko n=2 laplace: PPL=30.41 CE=3.4148
ko n=2 kn: PPL=11.89 CE=2.4756
ko n=2 wb: PPL=11.51 CE=2.4429
ko n=3 laplace: PPL=49.59 CE=3.9037
ko n=3 kn: PPL=9.71 CE=2.2728
ko n=3 wb: PPL=8.16 CE=2.0992
te n=2 laplace: PPL=30.61 CE=3.4214
te n=2 kn: PPL=10.99 CE=2.3966
te n=2 wb: PPL=10.89 CE=2.3879
te n=3 laplace: PPL=43.59 CE=3.7747
te n=3 kn: PPL=8.54 CE=2.1443
te n=3 wb: PPL=7.28 CE=1.9854
en n=2 laplace: PPL=1508.92 CE=7.3192
en n=2 wb: PPL=209.77 CE=5.3460
en n=3 laplace: PPL=8069.52 CE=8.9958
en n=3 wb: PPL=146.71 CE=4.9885
   language  order smoothing  vocab_size  tokens  perplexity  cross_entropy
1        ar      2        kn         965    3457       13.23         2.5825
0        ar      2   laplace         965    3457       34.56         3.5428
2        ar      2        wb         965    3457 