In [2]:
##IMPORTS
import pandas as pd
import polars as pl
from collections import Counter
from transformers import pipeline
from googletrans import Translator
import string
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import regex as re
from collections import Counter
from googletrans import Translator
import pickle
import numpy as np

# WEEK 36

In [3]:
#DOWNLOAD DATASET

splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])

## Stats

In [4]:
#STATS

#SIZE

langs = ["ar", "ko", "te"]


train_counts = df_train[df_train["lang"].isin(langs)].groupby("lang").size()


val_counts = df_val[df_val["lang"].isin(langs)].groupby("lang").size()

size_df = pd.DataFrame({
    "train_size": train_counts,
    "val_size": val_counts
}).fillna(0).astype(int)

print("Dataset sizes for selected languages:")
print(size_df)



Dataset sizes for selected languages:
      train_size  val_size
lang                      
ar          2558       415
ko          2422       356
te          1355       384


In [5]:

## Each language punctuation 
PUNCT_RE = re.compile(r"\p{P}", re.UNICODE)

# ARABIC
ar_train_q = df_train[df_train["lang"] == "ar"]["question"].astype(str)
ar_val_q   = df_val[df_val["lang"] == "ar"]["question"].astype(str)

ar_train_punct = Counter(ch for q in ar_train_q for ch in PUNCT_RE.findall(q))
ar_val_punct   = Counter(ch for q in ar_val_q for ch in PUNCT_RE.findall(q))

print("Arabic — TRAIN punctuation (char -> count):")
print(ar_train_punct.most_common())
print("Arabic — VAL punctuation (char -> count):")
print(ar_val_punct.most_common())

# KOREAN
ko_train_q = df_train[df_train["lang"] == "ko"]["question"].astype(str)
ko_val_q   = df_val[df_val["lang"] == "ko"]["question"].astype(str)

ko_train_punct = Counter(ch for q in ko_train_q for ch in PUNCT_RE.findall(q))
ko_val_punct   = Counter(ch for q in ko_val_q for ch in PUNCT_RE.findall(q))

print("Korean — TRAIN punctuation (char -> count):")
print(ko_train_punct.most_common())
print("Korean — VAL punctuation (char -> count):")
print(ko_val_punct.most_common())

# TELUGU
te_train_q = df_train[df_train["lang"] == "te"]["question"].astype(str)
te_val_q   = df_val[df_val["lang"] == "te"]["question"].astype(str)

te_train_punct = Counter(ch for q in te_train_q for ch in PUNCT_RE.findall(q))
te_val_punct   = Counter(ch for q in te_val_q for ch in PUNCT_RE.findall(q))

print("Telugu — TRAIN punctuation (char -> count):")
print(te_train_punct.most_common())
print("Telugu — VAL punctuation (char -> count):")
print(te_val_punct.most_common())


Arabic — TRAIN punctuation (char -> count):
[('؟', 2556), ('"', 80), ('(', 25), (')', 25), ('-', 5), ('.', 2), ('/', 2), ('«', 2), ('»', 2), ('_', 2), ('\\', 1), ('—', 1), ('!', 1), ('،', 1)]
Arabic — VAL punctuation (char -> count):
[('؟', 413), ('"', 4), ('(', 3), (')', 3), ('،', 1), ('-', 1)]
Korean — TRAIN punctuation (char -> count):
[('?', 2420), (',', 23), ('.', 16), ("'", 6), ('"', 6), ('-', 5), (':', 2), ('/', 1), ('\\', 1), ('(', 1), (')', 1)]
Korean — VAL punctuation (char -> count):
[('?', 356), ('.', 9), (',', 3), ('-', 1)]
Telugu — TRAIN punctuation (char -> count):
[('?', 1355), ('.', 42), (',', 6), ('-', 3), ('%', 1), ('–', 1)]
Telugu — VAL punctuation (char -> count):
[('?', 384), ('.', 2), ('-', 1), ('%', 1)]


In [6]:

## Each language total words (not counting punctuation)
# tokenizer: split on \W+ (non-word chars); protect hyphens between letters/digits
# safeguard: build punctuation set from training+validation data, do not count these tokens as well

SPLIT_RE = re.compile(r"\W+", re.UNICODE)          # tokenizer
HY = "HYPHENJOIN"                                  # placeholder for protected hyphens
PROTECT_HYPHEN = re.compile(r"(?<=[\p{L}\p{N}])-(?=[\p{L}\p{N}])", re.UNICODE)  # hyphen between letters/digits

# ARABIC
ar_train_q = df_train[df_train["lang"] == "ar"]["question"].astype(str)
ar_val_q   = df_val[df_val["lang"] == "ar"]["question"].astype(str)

# build punctuation set (optional safeguard)
ar_punct_set = set(ch for q in pd.concat([ar_train_q, ar_val_q]) for ch in PUNCT_RE.findall(q))

# protect hyphens, split on \W+, restore hyphens; slashes will split
ar_train_tokens = []
for q in ar_train_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ar_punct_set]
    ar_train_tokens.extend(toks)

ar_val_tokens = []
for q in ar_val_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ar_punct_set]
    ar_val_tokens.extend(toks)

print("Arabic — TRAIN total words:", len(ar_train_tokens))
print("Arabic — VAL total words:",   len(ar_val_tokens))

# KOREAN
ko_train_q = df_train[df_train["lang"] == "ko"]["question"].astype(str)
ko_val_q   = df_val[df_val["lang"] == "ko"]["question"].astype(str)

ko_punct_set = set(ch for q in pd.concat([ko_train_q, ko_val_q]) for ch in PUNCT_RE.findall(q))

ko_train_tokens = []
for q in ko_train_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_train_tokens.extend(toks)

ko_val_tokens = []
for q in ko_val_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_val_tokens.extend(toks)

print("Korean — TRAIN total words:", len(ko_train_tokens))
print("Korean — VAL total words:",   len(ko_val_tokens))

# TELUGU
te_train_q = df_train[df_train["lang"] == "te"]["question"].astype(str)
te_val_q   = df_val[df_val["lang"] == "te"]["question"].astype(str)

te_punct_set = set(ch for q in pd.concat([te_train_q, te_val_q]) for ch in PUNCT_RE.findall(q))

te_train_tokens = []
for q in te_train_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in te_punct_set]
    te_train_tokens.extend(toks)

te_val_tokens = []
for q in te_val_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in te_punct_set]
    te_val_tokens.extend(toks)

print("Telugu — TRAIN total words:", len(te_train_tokens))
print("Telugu — VAL total words:",   len(te_val_tokens))


Arabic — TRAIN total words: 16199
Arabic — VAL total words: 2617
Korean — TRAIN total words: 11858
Korean — VAL total words: 1736
Telugu — TRAIN total words: 7690
Telugu — VAL total words: 2302


In [7]:
#Stats on numeric and hyphenated tokens 

# ---- After tokenization for Arabic ----
ar_numbers_train = sum(1 for t in ar_train_tokens if t.isdigit())
ar_numbers_val   = sum(1 for t in ar_val_tokens if t.isdigit())

ar_hyphen_train = sum(1 for t in ar_train_tokens if "-" in t)
ar_hyphen_val   = sum(1 for t in ar_val_tokens if "-" in t)

print("Arabic — numeric tokens (train):", ar_numbers_train)
print("Arabic — numeric tokens (val):",   ar_numbers_val)
print("Arabic — hyphenated tokens (train):", ar_hyphen_train)
print("Arabic — hyphenated tokens (val):",   ar_hyphen_val)

# ---- After tokenization for Korean ----
ko_numbers_train = sum(1 for t in ko_train_tokens if t.isdigit())
ko_numbers_val   = sum(1 for t in ko_val_tokens if t.isdigit())

ko_hyphen_train = sum(1 for t in ko_train_tokens if "-" in t)
ko_hyphen_val   = sum(1 for t in ko_val_tokens if "-" in t)

print("Korean — numeric tokens (train):", ko_numbers_train)
print("Korean — numeric tokens (val):",   ko_numbers_val)
print("Korean — hyphenated tokens (train):", ko_hyphen_train)
print("Korean — hyphenated tokens (val):",   ko_hyphen_val)

# ---- After tokenization for Telugu ----
te_numbers_train = sum(1 for t in te_train_tokens if t.isdigit())
te_numbers_val   = sum(1 for t in te_val_tokens if t.isdigit())

te_hyphen_train = sum(1 for t in te_train_tokens if "-" in t)
te_hyphen_val   = sum(1 for t in te_val_tokens if "-" in t)

print("Telugu — numeric tokens (train):", te_numbers_train)
print("Telugu — numeric tokens (val):",   te_numbers_val)
print("Telugu — hyphenated tokens (train):", te_hyphen_train)
print("Telugu — hyphenated tokens (val):",   te_hyphen_val)


Arabic — numeric tokens (train): 78
Arabic — numeric tokens (val): 11
Arabic — hyphenated tokens (train): 3
Arabic — hyphenated tokens (val): 0
Korean — numeric tokens (train): 9
Korean — numeric tokens (val): 1
Korean — hyphenated tokens (train): 5
Korean — hyphenated tokens (val): 1
Telugu — numeric tokens (train): 107
Telugu — numeric tokens (val): 39
Telugu — hyphenated tokens (train): 0
Telugu — hyphenated tokens (val): 0


In [8]:
#5 Most common words (not counting punctuation); with English translations and their count

translator = Translator()

# ARABIC 
# (skip pure numbers)
ar_counts = Counter([t.lower() for t in ar_train_tokens if t ])
ar_top5 = ar_counts.most_common(5)

print("Arabic — Top 5 most common words (TRAIN):")
for w, c in ar_top5:
    try:
        en = translator.translate(w, src='ar', dest='en').text
    except Exception as e:
        en = f"[translation error: {e}]"
    print(f"{w}\tcount={c}\t→ {en}")

# KOREAN 
ko_counts = Counter([t.lower() for t in ko_train_tokens if t ])
ko_top5 = ko_counts.most_common(5)

print("\nKorean — Top 5 most common words (TRAIN):")
for w, c in ko_top5:
    try:
        en = translator.translate(w, src='ko', dest='en').text
    except Exception as e:
        en = f"[translation error: {e}]"
    print(f"{w}\tcount={c}\t→ {en}")

#  TELUGU
te_counts = Counter([t.lower() for t in te_train_tokens if t])
te_top5 = te_counts.most_common(5)

print("\nTelugu — Top 5 most common words (TRAIN):")
for w, c in te_top5:
    try:
        en = translator.translate(w, src='te', dest='en').text
    except Exception as e:
        en = f"[translation error: {e}]"
    print(f"{w}\tcount={c}\t→ {en}")


Arabic — Top 5 most common words (TRAIN):
في	count=593	→ in
من	count=587	→ from
متى	count=536	→ when
ما	count=443	→ what
هو	count=350	→ he

Korean — Top 5 most common words (TRAIN):
가장	count=527	→ most
무엇인가	count=497	→ Something
언제	count=336	→ when
몇	count=234	→ some
어디인가	count=228	→ Where

Telugu — Top 5 most common words (TRAIN):
ఎవరు	count=274	→ Who is
ఏది	count=192	→ Which one is
ఎన్ని	count=165	→ How many
ఎప్పుడు	count=154	→ When
ఏ	count=144	→ A.


### We conclude the words are "stop words" that we learned in the lecture

In [13]:
# Stats about answerable vs unanswerable questions

# Define languages and splits

split_dfs = {
    "train": df_train,
    "val":   df_val
}


rows = []
for split_name, df in split_dfs.items():
    for lang in langs:
        total = df[df["lang"] == lang].shape[0]
        ans   = df[(df["lang"] == lang) & (df["answerable"])].shape[0]
        unans = total - ans
        ratio = ans / total if total > 0 else 0
        rows.append([split_name, lang, total, ans, unans, ratio])

# Create summary DataFrame
summary = pd.DataFrame(rows, columns=["Split", "Language", "Total", "Answerable", "Unanswerable", "Answerable Ratio"])
print(summary.to_string(index=False))


Split Language  Total  Answerable  Unanswerable  Answerable Ratio
train       ar   2558        2303           255          0.900313
train       ko   2422        2359            63          0.973988
train       te   1355        1310            45          0.966790
  val       ar    415         363            52          0.874699
  val       ko    356         337            19          0.946629
  val       te    384         291            93          0.757812


## RULE BASE CLASSIFIER

In [11]:
translation_cache = {}


translator = Translator()

def translate_to_en_cached(texts, src_lang, batch_size=50):
    out = []
    new_texts = [t for t in texts if t not in translation_cache]

    for i in range(0, len(new_texts), batch_size):
        batch = new_texts[i:i+batch_size]
        try:
            translations = translator.translate(batch, src=src_lang, dest='en')
            for t, trans in zip(batch, translations):
                translation_cache[t] = trans.text
        except Exception:
            for t in batch:
                translation_cache[t] = ""  

    
    out = [translation_cache[t] for t in texts]
    return out


In [17]:
#STOP WORDS
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) | set(string.punctuation)
print(stop_words)

{'hadn', 'these', 'if', 'below', 'isn', "should've", 'my', 'its', 'too', "she'll", "i'm", 'those', 'myself', 'both', 'themselves', 'himself', "shouldn't", "it'll", 'won', "you've", '%', "it'd", 'being', "couldn't", 'an', "you'll", 're', 'any', 'a', 'does', '_', "it's", 'some', "didn't", "isn't", 'off', "they'll", 'each', "i've", 'wasn', 'at', 'him', 'how', 'to', '$', "i'd", 'haven', 'hers', 'll', '@', 'because', 'needn', 'until', 'yourself', 'is', "needn't", "you're", 'ma', 'over', "they'd", 'this', 'been', 'through', '/', 'be', 'can', 'hasn', 'having', 'down', 'own', '[', "doesn't", 'but', 'doing', "he'll", 'do', ']', 'we', 'why', 'out', 'under', 'you', ':', 'ourselves', '.', 'm', 'have', 'between', "aren't", 'on', 'only', 'yours', '!', '`', 'your', 'as', 'with', 'all', '&', 'couldn', 'where', 'once', "you'd", 'shan', 'what', 'has', 'i', "mustn't", 'for', '>', '\\', "haven't", "'", 'she', 'again', 'y', 'their', "she'd", "he'd", 'before', 'just', 'weren', 'itself', 'while', 'which', 'b

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# ================== INSTALLS (run once if needed) ==================
# !pip install -q transformers sentencepiece sacremoses torch nltk regex unidecode

# ================== IMPORTS & SETUP ==================
import numpy as np, pandas as pd, regex as re, string
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

EN_STOP = set(stopwords.words('english')) | set(string.punctuation)

# ================== NLLB-200 (GPU) ==================
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_ID = "facebook/nllb-200-distilled-600M"
SRC_CODES = {"ar":"arb_Arab", "ko":"kor_Hang", "te":"tel_Telu"}
TGT_CODE = "eng_Latn"

device_str = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device_str)

tok = AutoTokenizer.from_pretrained(MODEL_ID)
mt  = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID).to(device_str).eval()

@torch.inference_mode()
def translate_list(texts, src_lang_code, batch_size=32, max_length=320):
    out = []
    for i in range(0, len(texts), batch_size):
        batch = [str(t) for t in texts[i:i+batch_size]]
        try:
            enc = tok(batch, return_tensors="pt", padding=True, truncation=True,
                      max_length=max_length, src_lang=src_lang_code).to(device_str)
            gen = mt.generate(**enc, forced_bos_token_id=tok.lang_code_to_id[TGT_CODE],
                              max_length=max_length)
            dec = tok.batch_decode(gen, skip_special_tokens=True)
        except Exception:
            dec = [""] * len(batch)
        out.extend(dec)
    return out

# ================== NLTK tokenize + stopword removal ==================
def tok_en_rm_stop(text: str):
    if not isinstance(text, str): return []
    toks = word_tokenize(text, language="english", preserve_line=True)  # avoids punkt model
    return [t.lower() for t in toks if t and t.lower() not in EN_STOP]

# ================== Feature builders ==================
ALNUM = re.compile(r"[a-z0-9]+")
def ascii_alnum(s):
    s2 = unidecode((s or "").lower())
    return " ".join(ALNUM.findall(s2))

def char_ngrams(s, n=3):
    s = s.replace(" ", "")
    return [s[i:i+n] for i in range(max(0, len(s)-n+1))]

NUM_RE = re.compile(r"\d{2,}")  # years/counts

def build_features(q_texts, c_texts, q_toks, c_toks):
    F = []
    for qx, cx, q, c in zip(q_texts, c_texts, q_toks, c_toks):
        qs, cs = set(q), set(c)
        inter = len(qs & cs)
        overlap = inter / max(1, len(q))
        jacc   = inter / max(1, len(qs | cs))

        # number match on raw translated strings
        q_nums = set(NUM_RE.findall(qx or ""))
        has_num  = 1.0 if any(n in (cx or "") for n in q_nums) else 0.0

        # long token anchor (>=6 chars) present in context tokens
        has_long = 1.0 if any((len(t)>=6 and t in cs) for t in q) else 0.0

        # char-3-gram overlap on raw ascii-folded strings
        qa, ca = ascii_alnum(qx or ""), ascii_alnum(cx or "")
        q3 = set(char_ngrams(qa, 3))
        char3 = (sum(g in ca for g in q3)/len(q3)) if q3 else 0.0

        F.append((overlap, jacc, has_num, has_long, char3))
    return np.array(F, float)

def score(F):
    # [overlap, jaccard, num, long, char3]
    w = np.array([0.35, 0.05, 0.20, 0.10, 0.30], float)
    return (F*w).sum(axis=1)

def best_threshold(scores, gold):
    best_t, best_f1 = 0.0, -1.0
    for T in np.linspace(0, 1, 101):
        pred = scores >= T
        tp = np.sum((pred==1)&(gold==1))
        fp = np.sum((pred==1)&(gold==0))
        fn = np.sum((pred==0)&(gold==1))
        prec = tp/(tp+fp) if (tp+fp) else 0.0
        rec  = tp/(tp+fn) if (tp+fn) else 0.0
        f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
        if f1 > best_f1:
            best_f1, best_t = f1, T
    return float(best_t), float(best_f1)

# ================== Train→Val per-language evaluation ==================
def eval_language(lang_code):
    tr = df_train[df_train["lang"]==lang_code].copy()
    va = df_val[df_val["lang"]==lang_code].copy()

    # Translate Q & C to English
    src = SRC_CODES[lang_code]
    tr_q_en = translate_list(tr["question"].astype(str).tolist(), src)
    tr_c_en = translate_list(tr["context"].astype(str).tolist(),  src)
    va_q_en = translate_list(va["question"].astype(str).tolist(), src)
    va_c_en = translate_list(va["context"].astype(str).tolist(),  src)

    # Quick sanity check
    pct_empty = lambda L: np.mean([(not isinstance(x,str)) or (x.strip()=="") for x in L])*100
    print(f"[{lang_code}] emptyQ train/val: {pct_empty(tr_q_en):.2f}% / {pct_empty(va_q_en):.2f}%")

    # Tokenize (NLTK) + remove English stops
    tr_q_tok = [tok_en_rm_stop(s) for s in tr_q_en]
    tr_c_tok = [tok_en_rm_stop(s) for s in tr_c_en]
    va_q_tok = [tok_en_rm_stop(s) for s in va_q_en]
    va_c_tok = [tok_en_rm_stop(s) for s in va_c_en]

    # Features → scores
    F_tr = build_features(tr_q_en, tr_c_en, tr_q_tok, tr_c_tok)
    s_tr = score(F_tr)
    T, f1_tr = best_threshold(s_tr, tr["answerable"].astype(int).values)

    F_va = build_features(va_q_en, va_c_en, va_q_tok, va_c_tok)
    s_va = score(F_va)
    pred = (s_va >= T).astype(int)
    gold = va["answerable"].astype(int).values

    tp = int(((pred==1)&(gold==1)).sum())
    fp = int(((pred==1)&(gold==0)).sum())
    fn = int(((pred==0)&(gold==1)).sum())
    tn = int(((pred==0)&(gold==0)).sum())

    acc  = (tp+tn)/max(1, tp+tn+fp+fn)
    prec = tp/max(1, tp+fp)
    rec  = tp/max(1, tp+fn)
    f1   = 0.0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)

    return {
        "lang": lang_code,
        "threshold": round(T,3),
        "train_F1_at_T": round(f1_tr,4),
        "val_acc": round(acc,4),
        "val_prec": round(prec,4),
        "val_rec": round(rec,4),
        "val_f1": round(f1,4),
        "cm": {"TP":tp,"FP":fp,"FN":fn,"TN":tn},
    }

# ===== Run for Arabic / Korean / Telugu =====
results = [eval_language(l) for l in ["ar","ko","te"]]
summary = pd.DataFrame([{k:v for k,v in r.items() if k!="cm"} for r in results])
print("\n", summary.to_string(index=False))
print("\nConfusion matrices:")
for r in results:
    print(r["lang"], r["cm"])


Device: cuda
[ar] emptyQ train/val: 100.00% / 100.00%
[ko] emptyQ train/val: 100.00% / 100.00%
[te] emptyQ train/val: 100.00% / 100.00%

 lang  threshold  train_F1_at_T  val_acc  val_prec  val_rec  val_f1
  ar        0.0         0.9475   0.8747    0.8747      1.0  0.9332
  ko        0.0         0.9868   0.9466    0.9466      1.0  0.9726
  te        0.0         0.9831   0.7578    0.7578      1.0  0.8622

Confusion matrices:
ar {'TP': 363, 'FP': 52, 'FN': 0, 'TN': 0}
ko {'TP': 337, 'FP': 19, 'FN': 0, 'TN': 0}
te {'TP': 291, 'FP': 93, 'FN': 0, 'TN': 0}


In [5]:
# ================== DIAGNOSTICS CELL ==================

def debug_language(lang_code):
    tr = df_train[df_train["lang"]==lang_code].copy()
    src = SRC_CODES[lang_code]

    # Translate Q & C to English
    tr_q_en = translate_list(tr["question"].astype(str).tolist(), src)
    tr_c_en = translate_list(tr["context"].astype(str).tolist(), src)

    # Sanity: empty translation %
    def pct_empty(lst): 
        return np.mean([(not isinstance(x,str)) or (x.strip()=="") for x in lst])*100
    print(f"\n[{lang_code}] emptyQ train: {pct_empty(tr_q_en):.2f}% | emptyC train: {pct_empty(tr_c_en):.2f}%")

    # Tokenize (with current settings)
    tr_q_tok = [tok_en_rm_stop(s) for s in tr_q_en]
    tr_c_tok = [tok_en_rm_stop(s) for s in tr_c_en]

    # Features → scores
    F_tr = build_features(tr_q_en, tr_c_en, tr_q_tok, tr_c_tok)
    s_tr = score(F_tr)

    # Print stats
    print("Score stats (train): min", s_tr.min(), "max", s_tr.max(), "mean", s_tr.mean())
    print("Frac > 0:", (s_tr > 0).mean())

    names = ["overlap","jacc","has_num","has_long","char3"]
    print("Feature mins:", dict(zip(names, F_tr.min(axis=0))))
    print("Feature maxs:", dict(zip(names, F_tr.max(axis=0))))
    print("Feature means:",dict(zip(names, F_tr.mean(axis=0))))

    # Print sample translations
    print("\nSample translations:")
    for i in range(min(3, len(tr_q_en))):
        print(f"Q_SRC: {tr['question'].iloc[i]}")
        print(f"Q_EN : {tr_q_en[i]}")
        print(f"C_EN (first 100): {(tr_c_en[i] or '')[:100]}")
        print("---")

# Run for each language
for l in ["ar","ko","te"]:
    debug_language(l)



[ar] emptyQ train: 100.00% | emptyC train: 100.00%
Score stats (train): min 0.0 max 0.0 mean 0.0
Frac > 0: 0.0
Feature mins: {'overlap': np.float64(0.0), 'jacc': np.float64(0.0), 'has_num': np.float64(0.0), 'has_long': np.float64(0.0), 'char3': np.float64(0.0)}
Feature maxs: {'overlap': np.float64(0.0), 'jacc': np.float64(0.0), 'has_num': np.float64(0.0), 'has_long': np.float64(0.0), 'char3': np.float64(0.0)}
Feature means: {'overlap': np.float64(0.0), 'jacc': np.float64(0.0), 'has_num': np.float64(0.0), 'has_long': np.float64(0.0), 'char3': np.float64(0.0)}

Sample translations:
Q_SRC: متى تدخلت روسيا في  الحرب الأهلية السورية؟
Q_EN : 
C_EN (first 100): 
---
Q_SRC: متى حصلت هنغاريا على استقلالها من النمسا ؟
Q_EN : 
C_EN (first 100): 
---
Q_SRC: متى تحالفت فرنسا و بريطانيا العظمى ضد ألمانيا في حرب؟
Q_EN : 
C_EN (first 100): 
---

[ko] emptyQ train: 100.00% | emptyC train: 100.00%
Score stats (train): min 0.0 max 0.0 mean 0.0
Frac > 0: 0.0
Feature mins: {'overlap': np.float64(0.0), 'ja

In [1]:
# ================== IMPORTS & SETUP ==================
import numpy as np, pandas as pd, regex as re, string
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

EN_STOP = set(stopwords.words('english')) | set(string.punctuation)

# ================== NLLB-200 (GPU) ==================
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_ID = "facebook/nllb-200-distilled-600M"
SRC_CODES = {"ar":"arb_Arab", "ko":"kor_Hang", "te":"tel_Telu"}
TGT_CODE = "eng_Latn"

device_str = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device_str)

Device: cuda


In [6]:
# ===== FIX & VERIFY TRANSLATION (NLLB-200) =====
# If you just installed CUDA/PyTorch, RESTART kernel before running this.

import numpy as np, pandas as pd
import torch
from transformers import pipeline

print("PyTorch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
device = 0 if torch.cuda.is_available() else -1
print("Using device idx for HF pipeline:", device)

MODEL_ID = "facebook/nllb-200-distilled-600M"
SRC_CODES = {"ar": "arb_Arab", "ko": "kor_Hang", "te": "tel_Telu"}
TGT_CODE = "eng_Latn"

# Build a single multilingual translation pipeline
nllb = pipeline("translation", model=MODEL_ID, tokenizer=MODEL_ID, device=device)

def translate_list_pipe(texts, src_lang, batch_size=32, max_length=320):
    """Robust batched translation via HF pipeline, with explicit src/tgt codes and no silent failures."""
    out = []
    for i in range(0, len(texts), batch_size):
        batch = [str(x) if isinstance(x, str) else "" for x in texts[i:i+batch_size]]
        try:
            preds = nllb(batch, src_lang=SRC_CODES[src_lang], tgt_lang=TGT_CODE,
                         truncation=True, max_length=max_length)
            out.extend([p["translation_text"] for p in preds])
        except Exception as e:
            print(f"[WARN] Translation failed for batch {i}:{i+len(batch)} — {type(e).__name__}: {e}")
            out.extend([""] * len(batch))
    return out

def pct_empty(lst):
    return np.mean([(not isinstance(x,str)) or (x.strip()=="") for x in lst]) * 100.0

# --- 1) Smoke-test the model with known strings (should NOT be empty) ---
tests = {
    "ar": ["مرحبا", "متى تأسست جامعة القاهرة؟"],
    "ko": ["안녕하세요", "서울의 인구는 얼마입니까?"],
    "te": ["నమస్కారం", "హైదరాబాదు ఎక్కడ ఉంది?"],
}
for lg, arr in tests.items():
    tr = translate_list_pipe(arr, lg, batch_size=2)
    print(f"\nSmoke test [{lg}]")
    for src, tgt in zip(arr, tr):
        print("SRC:", src)
        print("EN :", tgt)

# --- 2) Sanity-check a small slice from your dataset (questions) ---
SAMPLE = 16  # small to be fast; raise after it works
for lg in ["ar","ko","te"]:
    q_src = df_train.loc[df_train["lang"]==lg, "question"].astype(str).head(SAMPLE).tolist()
    q_en  = translate_list_pipe(q_src, lg, batch_size=8)
    print(f"\n[{lg}] sample translation empties (questions): {pct_empty(q_en):.2f}%  on {len(q_en)} items")
    # peek a couple
    for i in range(min(2, len(q_en))):
        print("Q_SRC:", q_src[i])
        print("Q_EN :", q_en[i])
        print("---")

# --- 3) Contexts in TyDi XOR RC are already English; translating them is unnecessary & slow.
#       If your assignment requires translating both, you can do it — but we recommend skipping for speed.
TRANSLATE_CONTEXT = False  # set True only if you must translate contexts too

if TRANSLATE_CONTEXT:
    for lg in ["ar","ko","te"]:
        c_src = df_train.loc[df_train["lang"]==lg, "context"].astype(str).head(SAMPLE).tolist()
        c_en  = translate_list_pipe(c_src, lg, batch_size=8)
        print(f"\n[{lg}] sample translation empties (contexts): {pct_empty(c_en):.2f}%  on {len(c_en)} items")
        for i in range(min(1, len(c_en))):
            print("C_SRC (first 150):", c_src[i][:150])
            print("C_EN  (first 150):", c_en[i][:150])
            print("---")
else:
    print("\n[Info] Skipping context translation (contexts are already English in this dataset).")


PyTorch: 2.7.1+cu118
CUDA available: True
Using device idx for HF pipeline: 0


Device set to use cuda:0



Smoke test [ar]
SRC: مرحبا
EN : Hey , what 's up ?
SRC: متى تأسست جامعة القاهرة؟
EN : When was Cairo University founded?

Smoke test [ko]
SRC: 안녕하세요
EN : Hey, what's up?
SRC: 서울의 인구는 얼마입니까?
EN : What is the population of Seoul?

Smoke test [te]
SRC: నమస్కారం
EN : Greetings
SRC: హైదరాబాదు ఎక్కడ ఉంది?
EN : Where is Hyderabad?

[ar] sample translation empties (questions): 0.00%  on 16 items
Q_SRC: متى تدخلت روسيا في  الحرب الأهلية السورية؟
Q_EN : When did Russia intervene in the Syrian civil war?
---
Q_SRC: متى حصلت هنغاريا على استقلالها من النمسا ؟
Q_EN : When did Hungary gain its independence from Austria ?
---

[ko] sample translation empties (questions): 0.00%  on 16 items
Q_SRC: 30년 전쟁의 승자는 누구인가?
Q_EN : Who is the winner of the Thirty Years' War?
---
Q_SRC: 엑스선은 누가 발견하였는가?
Q_EN : Who discovered X-rays?
---

[te] sample translation empties (questions): 0.00%  on 16 items
Q_SRC: ప్రపంచంలో  మొట్టమొదటి దూర విద్య విద్యాలయం ఏ దేశంలో స్థాపించబడింది ?
Q_EN : The world's first distance learn

In [None]:
# ================== RULE-BASED CLASSIFIER (with CUDA NLLB Q-translation) ==================
import numpy as np, pandas as pd, regex as re, string
from unidecode import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

EN_STOP = set(stopwords.words('english')) | set(string.punctuation)

# ---- tokenization helpers ----
def tok_en_rm_stop(text: str):
    if not isinstance(text, str): return []
    toks = word_tokenize(text, language="english", preserve_line=True)
    return [t.lower() for t in toks if t and t.lower() not in EN_STOP]

def tok_en_keep_all(text: str):
    if not isinstance(text, str): return []
    toks = word_tokenize(text, language="english", preserve_line=True)
    return [t.lower() for t in toks if t]

# ---- text normalization / features ----
ALNUM = re.compile(r"[a-z0-9]+")
def ascii_alnum(s):
    s2 = unidecode((s or "").lower())
    return " ".join(ALNUM.findall(s2))

def char_ngrams(s, n=3):
    s = s.replace(" ", "")
    return [s[i:i+n] for i in range(max(0, len(s)-n+1))]

NUM_RE = re.compile(r"\d{2,}")  # match 2+ digit numbers (years, counts)

def build_features(q_texts, c_texts, q_toks, c_toks):
    """
    Return Nx5 features: [overlap, jaccard, has_num, has_long, char3]
    """
    F = []
    for qx, cx, q, c in zip(q_texts, c_texts, q_toks, c_toks):
        qs, cs = set(q), set(c)
        inter = len(qs & cs)
        overlap = inter / max(1, len(q))
        jacc   = inter / max(1, len(qs | cs))

        # numbers on RAW strings
        q_nums = set(NUM_RE.findall(qx or ""))
        has_num  = 1.0 if any(n in (cx or "") for n in q_nums) else 0.0

        # long token anchor (>=5 chars) present in context tokens
        has_long = 1.0 if any((len(t) >= 5 and t in cs) for t in q) else 0.0

        # char-3-gram overlap on ascii-folded RAW strings
        qa, ca = ascii_alnum(qx or ""), ascii_alnum(cx or "")
        q3 = set(char_ngrams(qa, 3))
        char3 = (sum(g in ca for g in q3)/len(q3)) if q3 else 0.0

        F.append((overlap, jacc, has_num, has_long, char3))
    return np.array(F, float)

def score(F):
    # weights: emphasize token overlap + char3; support from numbers/long; small jaccard
    w = np.array([0.40, 0.05, 0.20, 0.10, 0.25], float)
    return (F*w).sum(axis=1)

def best_threshold(scores, gold):
    best_t, best_f1 = 0.0, -1.0
    for T in np.linspace(0, 1, 101):
        pred = scores >= T
        tp = np.sum((pred==1)&(gold==1))
        fp = np.sum((pred==1)&(gold==0))
        fn = np.sum((pred==0)&(gold==1))
        prec = tp/(tp+fp) if (tp+fp) else 0.0
        rec  = tp/(tp+fn) if (tp+fn) else 0.0
        f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
        if f1 > best_f1:
            best_f1, best_t = f1, T
    return float(best_t), float(best_f1)

def eval_language_rule_based(lang_code, translate_contexts=False, batch_size=64):
    # 1) slice data
    tr = df_train[df_train["lang"]==lang_code].copy()
    va = df_val[df_val["lang"]==lang_code].copy()

    # 2) translate questions → EN (uses working nllb+translate_list_pipe from previous cell)
    tr_q_en = translate_list_pipe(tr["question"].astype(str).tolist(), lang_code, batch_size=batch_size)
    va_q_en = translate_list_pipe(va["question"].astype(str).tolist(), lang_code, batch_size=batch_size)

    # 3) contexts: either keep as-is (English already) or translate if you must
    if translate_contexts:
        tr_c_en = translate_list_pipe(tr["context"].astype(str).tolist(), lang_code, batch_size=batch_size)
        va_c_en = translate_list_pipe(va["context"].astype(str).tolist(), lang_code, batch_size=batch_size)
    else:
        tr_c_en = tr["context"].astype(str).tolist()
        va_c_en = va["context"].astype(str).tolist()

    # 4) tokenize: remove stops in Q, keep all tokens in C (better recall)
    tr_q_tok = [tok_en_rm_stop(s) for s in tr_q_en]
    va_q_tok = [tok_en_rm_stop(s) for s in va_q_en]
    tr_c_tok = [tok_en_keep_all(s) for s in tr_c_en]
    va_c_tok = [tok_en_keep_all(s) for s in va_c_en]

    # 5) features → scores → threshold on train
    F_tr = build_features(tr_q_en, tr_c_en, tr_q_tok, tr_c_tok)
    s_tr = score(F_tr)
    T, f1_tr = best_threshold(s_tr, tr["answerable"].astype(int).values)

    # 6) eval on val
    F_va = build_features(va_q_en, va_c_en, va_q_tok, va_c_tok)
    s_va = score(F_va)
    pred = (s_va >= T).astype(int)
    gold = va["answerable"].astype(int).values

    tp = int(((pred==1)&(gold==1)).sum())
    fp = int(((pred==1)&(gold==0)).sum())
    fn = int(((pred==0)&(gold==1)).sum())
    tn = int(((pred==0)&(gold==0)).sum())

    acc  = (tp+tn)/max(1, tp+tn+fp+fn)
    prec = tp/max(1, tp+fp)
    rec  = tp/max(1, tp+fn)
    f1   = 0.0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)

    return {
        "lang": lang_code,
        "threshold": round(T,3),
        "train_F1_at_T": round(f1_tr,4),
        "val_acc": round(acc,4),
        "val_prec": round(prec,4),
        "val_rec": round(rec,4),
        "val_f1": round(f1,4),
        "cm": {"TP":tp,"FP":fp,"FN":fn,"TN":tn},
        "score_min_max_train": (float(s_tr.min()), float(s_tr.max())),
        "score_min_max_val": (float(s_va.min()), float(s_va.max())),
    }

# ---- run for all three languages
results = [eval_language_rule_based(l, translate_contexts=False, batch_size=64) for l in ["ar","ko","te"]]

summary = pd.DataFrame([{k:v for k,v in r.items() if k not in ("cm","score_min_max_train","score_min_max_val")} for r in results])
print(summary.to_string(index=False))

print("\nConfusion matrices:")
for r in results:
    print(r["lang"], r["cm"])

print("\nScore ranges:")
for r in results:
    print(r["lang"], "train", r["score_min_max_train"], "| val", r["score_min_max_val"])


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
