### Week 38

In [None]:
from datasets import load_dataset
import pandas as pd, numpy as np, re, unicodedata, torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.pipeline import FeatureUnion
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
import math

In [None]:
from datasets import load_dataset
# Load the `coastalcph/tydi_xor_rc` dataset from the Hugging Face Hub and keep
# the train / validation splits as pandas DataFrames for the rest of the notebook.
dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train = dataset["train"].to_pandas()
df_val = dataset["validation"].to_pandas()

# Last expression of the cell -> rendered with the DataFrame's rich (HTML) repr
# instead of the wrapped plain-text dump that print() produces.
df_train.head()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/6.88M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/4.80k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15343 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3011 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4 [00:00<?, ? examples/s]

                                            question  \
0  উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...   
1           দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?   
2  মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...   
3  আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...   
4          বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?   

                                             context lang  answerable  \
0  WikiLeaks () is an international non-profit or...   bn        True   
1  The war in Europe concluded with an invasion o...   bn        True   
2  Same-sex marriage in the United States expande...   bn       False   
3  The exact number of Arab casualties is unknown...   bn        True   
4  As Thomas Hall (2000) notes, "The Sung Empire ...   bn        True   

   answer_start        answer answer_inlang  
0           182          2006          None  
1            48       Germany          None  
2            -1            no          None  
3            39       unknown          N

In [None]:
# Global configuration shared by every experiment in this notebook.
SEED = 42
np.random.seed(SEED)

# Languages evaluated (values of the dataset's `lang` column).
LANGS = ["ar", "ko", "te"]

# Machine-translation checkpoint and the NLLB language tag for each language code.
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
NLLB_CODES = dict(
    ar="arb_Arab",
    ko="kor_Hang",
    te="tel_Telu",
    en="eng_Latn",
)

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

Using device: cuda


In [None]:
# NLLB tokenizer and seq2seq model, loaded once and used as module-level globals
# by _translate_batch.
tok = AutoTokenizer.from_pretrained(NLLB_MODEL)
mt = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
mt = mt.to(DEVICE)

def _translate_batch(sentences, src_code, max_new_tokens=128):
    """Translate one batch of sentences into English with the global NLLB model.

    Args:
        sentences: list of source-language strings.
        src_code: NLLB language tag of the source text (e.g. a value of NLLB_CODES).
        max_new_tokens: generation budget passed to ``mt.generate``.

    Returns:
        List of decoded strings with whitespace runs collapsed to one space and
        leading/trailing whitespace removed.
    """
    # The tokenizer is shared, so this sets the source language for the whole batch.
    tok.src_lang = src_code
    english_bos = tok.convert_tokens_to_ids(NLLB_CODES["en"])

    batch = tok(sentences, return_tensors="pt", padding=True, truncation=True)
    batch = batch.to(DEVICE)
    with torch.no_grad():
        generated = mt.generate(
            **batch,
            forced_bos_token_id=english_bos,  # force decoding to start in English
            max_new_tokens=max_new_tokens,
        )

    cleaned = []
    for text in tok.batch_decode(generated, skip_special_tokens=True):
        cleaned.append(re.sub(r"\s+", " ", text).strip())
    return cleaned

def translate_series_to_en(series, src_code, batch_size=16):
    """Translate every entry of a pandas Series into English, batch by batch.

    Args:
        series: pandas Series of source-language texts.
        src_code: NLLB language tag of the source text.
        batch_size: number of entries sent to ``_translate_batch`` per call.

    Returns:
        A Series of translations carrying the same index as ``series``.
    """
    def _as_text(value):
        # Missing values (None, NaN, pd.NA) become "" rather than being
        # stringified to "nan" / "<NA>" and machine-translated as real text.
        if value is None or value is pd.NA:
            return ""
        if isinstance(value, float) and value != value:  # NaN is the only float != itself
            return ""
        return str(value)

    translated = []
    for start in range(0, len(series), batch_size):
        chunk = [_as_text(v) for v in series.iloc[start:start + batch_size].tolist()]
        translated.extend(_translate_batch(chunk, src_code))
    return pd.Series(translated, index=series.index)


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

## Bag of words + Logistic Regression


In [None]:
def normalize_text(x):
    """Return ``x`` as NFKC-normalised text with whitespace collapsed.

    Missing values (``None`` or a float NaN, which is how pandas represents a
    missing cell) map to the empty string instead of the literal text "nan".
    Any other value is converted with ``str``; runs of whitespace become a
    single space and the result is stripped.
    """
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    return re.sub(r"\s+", " ", x).strip()

In [None]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")  # required by newer NLTK

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def nltk_tokenizer(doc: str):
    """Split ``doc`` into tokens with NLTK's ``word_tokenize`` (TfidfVectorizer hook)."""
    tokens = word_tokenize(doc)
    return tokens

def run_logreg_bows(df_train, df_val):
    """Fit a TF-IDF bag-of-words + logistic-regression answerability classifier.

    Args:
        df_train: training frame with ``question_en``, ``context_en`` and ``label``.
        df_val: validation frame with the same columns.

    Returns:
        dict with ``y_true`` (validation labels), ``y_pred`` (the classifier's
        default predictions) and ``y_prob`` (probability of class 1), all
        aligned with the rows of ``df_val``.
    """
    def _join_pair(df):
        # Fill each part *before* concatenating: str + NaN is NaN, so filling
        # afterwards would blank the whole row when only one part is missing.
        return df["question_en"].fillna("") + " [SEP] " + df["context_en"].fillna("")

    X_train = _join_pair(df_train)
    X_val   = _join_pair(df_val)
    y_train = df_train["label"].astype(int).values
    y_val   = df_val["label"].astype(int).values

    vec = TfidfVectorizer(
        analyzer="word",
        tokenizer=nltk_tokenizer,   # NLTK
        token_pattern=None,         # silence warning
        lowercase=True,
        strip_accents="unicode",
        ngram_range=(1,2),
        min_df=2,
        max_features=200_000,
    )
    # Vocabulary and IDF weights come from the training split only.
    Xtr = vec.fit_transform(X_train)
    Xva = vec.transform(X_val)

    # n_jobs is not passed: it has no effect with the liblinear solver and only
    # produces a warning.
    clf = LogisticRegression(solver="liblinear", class_weight="balanced", max_iter=2000)
    clf.fit(Xtr, y_train)

    y_pred = clf.predict(Xva)
    y_prob = clf.predict_proba(Xva)[:, 1]

    return {"y_true": y_val, "y_pred": y_pred, "y_prob": y_prob}


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:


def pick_threshold(y_true, y_prob, mode="f1"):
    """Pick the decision threshold that maximises a validation score.

    Thresholds 0.05, 0.10, ..., 0.95 are tried in increasing order and the
    first one reaching the best score is kept.

    Args:
        y_true: array of 0/1 labels.
        y_prob: array of class-1 probabilities, aligned with ``y_true``.
        mode: ``"youden"`` scores by Youden's J (TPR - FPR); anything else
            scores by macro-F1.

    Returns:
        The selected threshold (0.5 if no candidate scores above -1).
    """
    best_t, best_score = 0.5, -1
    for t in np.linspace(0.05, 0.95, 19):
        y_hat = (y_prob >= t).astype(int)
        if mode == "youden":
            # labels=[0, 1] pins a 2x2 matrix; without it a split where only one
            # class occurs yields a 1x1 matrix and the 4-way unpack fails.
            tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
            tpr = tp/(tp+fn) if (tp+fn) else 0.0
            fpr = fp/(fp+tn) if (fp+tn) else 0.0
            score = tpr - fpr  # Youden's J
        else:  # macro-F1
            score = f1_score(y_true, y_hat, average="macro")
        if score > best_score:
            best_score, best_t = score, t
    return best_t

summary_rows = []

# Per-language bag-of-words baseline.
# NOTE: the threshold is tuned and scored on the same validation split, so the
# reported numbers are optimistic estimates.
for lang in LANGS:
    df_train_lang = df_train[df_train["lang"] == lang].copy()
    df_val_lang   = df_val[df_val["lang"] == lang].copy()
    # Nothing to fit on, or nothing to score on: skip this language.
    if len(df_train_lang) == 0 or len(df_val_lang) == 0:
        continue

    # labels + text
    df_train_lang["label"] = df_train_lang["answerable"].astype(int)
    df_val_lang["label"]   = df_val_lang["answerable"].astype(int)
    # No .astype(str) here: it would turn a missing value into the literal text
    # "None" before normalize_text / translate_series_to_en can map it to "".
    # The context column is only normalised, not translated.
    df_train_lang["context_en"] = df_train_lang["context"].apply(normalize_text)
    df_val_lang["context_en"]   = df_val_lang["context"].apply(normalize_text)
    src_code = NLLB_CODES[lang]
    df_train_lang["question_en"] = translate_series_to_en(df_train_lang["question"], src_code)
    df_val_lang["question_en"]   = translate_series_to_en(df_val_lang["question"],   src_code)

    # train & get probabilities
    pack = run_logreg_bows(df_train_lang, df_val_lang)
    y_true, y_prob = pack["y_true"], pack["y_prob"]

    # tune threshold (try mode="youden" if you prefer balanced TPR/FPR)
    best_t = pick_threshold(y_true, y_prob, mode="f1")

    # recompute metrics at tuned threshold
    y_hat = (y_prob >= best_t).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
    acc = (tp + tn) / (tp + tn + fp + fn)
    tpr = tp / (tp + fn) if (tp + fn) else 0.0
    fpr = fp / (fp + tn) if (fp + tn) else 0.0
    tnr = tn / (tn + fp) if (tn + fp) else 0.0

    summary_rows.append({
        "Language": lang.upper(),
        "τ (threshold)": round(best_t, 2),
        "Accuracy": f"{acc*100:.1f}%",
        "TPR": f"{tpr*100:.1f}%",
        "FPR": f"{fpr*100:.1f}%",
        "TNR": f"{tnr*100:.1f}%",
    })

summary_df_tuned = pd.DataFrame(summary_rows)
summary_df_tuned




Unnamed: 0,Language,τ (threshold),Accuracy,TPR,FPR,TNR
0,AR,0.5,86.5%,93.1%,59.6%,40.4%
1,KO,0.6,93.5%,97.6%,78.9%,21.1%
2,TE,0.6,78.4%,88.7%,53.8%,46.2%


## XLM-RoBERTa

In [None]:
import inspect, torch, numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from sklearn.metrics import confusion_matrix, f1_score
from datasets import load_dataset, Dataset as HFDataset
import os

# Seed NumPy and PyTorch for the XLM-R experiments.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Encoder checkpoint and the token budget for each (question, context) pair.
MODEL_NAME = "xlm-roberta-base"
MAX_LEN = 384

# Switch Weights & Biases logging off through each environment variable:
# hard disable, legacy mode switch, and message suppression.
os.environ.update({
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
    "WANDB_SILENT": "true",
})



In [None]:

def add_labels(df):
    """Return a copy of ``df`` with an integer ``label`` column built from ``answerable``.

    The input frame is left unmodified.
    """
    return df.assign(label=df["answerable"].astype(int))

def sweep_thresholds(y_true, y_prob, mode="f1"):
    """Return best threshold in [0.05..0.95] by Macro-F1 (default) or Youden's J.

    Candidates are 0.05, 0.10, ..., 0.95, tried in increasing order; the first
    one reaching the best score wins.

    Args:
        y_true: array of 0/1 labels.
        y_prob: array of class-1 probabilities, aligned with ``y_true``.
        mode: ``"youden"`` for TPR - FPR, anything else for macro-F1.

    Returns:
        The chosen threshold as a Python float.
    """
    best_t, best_score = 0.5, -1
    for t in np.linspace(0.05, 0.95, 19):
        y_hat = (y_prob >= t).astype(int)
        if mode == "youden":
            # labels=[0, 1] pins a 2x2 matrix; without it a split where only one
            # class occurs yields a 1x1 matrix and the 4-way unpack fails.
            tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
            tpr = tp/(tp+fn) if (tp+fn) else 0.0
            fpr = fp/(fp+tn) if (fp+tn) else 0.0
            score = tpr - fpr
        else:
            score = f1_score(y_true, y_hat, average="macro")
        if score > best_score:
            best_score, best_t = score, t
    return float(best_t)

def metrics_at_threshold(y_true, y_prob, t):
    """Binarise ``y_prob`` at threshold ``t`` and compute summary metrics.

    Args:
        y_true: array of 0/1 labels.
        y_prob: array of class-1 probabilities, aligned with ``y_true``.
        t: decision threshold; a sample is predicted 1 when ``y_prob >= t``.

    Returns:
        Tuple ``(accuracy, TPR, FPR, TNR, macro_F1)``. A rate whose denominator
        is zero is reported as 0.0.
    """
    y_hat = (y_prob >= t).astype(int)
    # labels=[0, 1] pins a 2x2 matrix; without it a split where only one class
    # occurs yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
    acc = (tp+tn)/(tp+tn+fp+fn)
    tpr = tp/(tp+fn) if (tp+fn) else 0.0
    fpr = fp/(fp+tn) if (fp+tn) else 0.0
    tnr = tn/(tn+fp) if (tn+fp) else 0.0
    f1m = f1_score(y_true, y_hat, average="macro")
    return acc, tpr, fpr, tnr, f1m


In [None]:
# Shared XLM-R tokenizer: used by make_hf_datasets_for_lang to encode
# (question, context) pairs and later handed to the Trainer and its padding collator.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def make_hf_datasets_for_lang(df_train, df_val, lang):
    """Build tokenised Hugging Face datasets for a single language.

    Args:
        df_train: training frame with ``lang``, ``question``, ``context`` and
            ``answerable`` columns.
        df_val: validation frame with the same columns.
        lang: value of the ``lang`` column to keep.

    Returns:
        ``(train_dataset, val_dataset)`` in torch format, each holding only
        ``input_ids``, ``attention_mask`` and ``label``; ``(None, None)`` when
        either split has no rows for ``lang``.
    """
    train_rows = df_train[df_train["lang"] == lang].copy()
    val_rows = df_val[df_val["lang"] == lang].copy()
    if len(train_rows) == 0 or len(val_rows) == 0:
        return None, None

    model_columns = ["input_ids", "attention_mask", "label"]

    def encode_pairs(batch):
        # Encode (question, context) pairs; only the context is truncated, and
        # every example is padded to MAX_LEN so batches have a fixed length.
        return tokenizer(
            batch["question"],
            batch["context"],
            truncation="only_second",
            max_length=MAX_LEN,
            padding="max_length",
        )

    def to_hf(rows):
        # pandas rows -> labelled HF dataset -> tokenised, model-ready tensors
        labelled = add_labels(rows)
        ds = HFDataset.from_pandas(labelled[["question", "context", "label"]], preserve_index=False)
        ds = ds.map(encode_pairs, batched=True)
        extra = [col for col in ds.column_names if col not in model_columns]
        ds = ds.remove_columns(extra)
        ds.set_format("torch")
        return ds

    return to_hf(train_rows), to_hf(val_rows)


In [None]:


# Metric objects from the `evaluate` library; compute_metrics below calls their
# .compute() on the argmax predictions.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Trainer metric hook: accuracy and macro-F1 of the argmax predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair.

    Returns:
        dict with keys ``accuracy`` and ``macro_f1``.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    accuracy = accuracy_metric.compute(predictions=predicted, references=labels)
    macro = f1_metric.compute(predictions=predicted, references=labels, average="macro")
    return {"accuracy": accuracy["accuracy"], "macro_f1": macro["f1"]}

def train_xlmr(hf_tr, hf_va, output_dir):
    """Fine-tune MODEL_NAME as a binary classifier and score the validation set.

    Args:
        hf_tr: tokenised training dataset (``input_ids``, ``attention_mask``, ``label``).
        hf_va: tokenised validation dataset with the same columns.
        output_dir: directory handed to ``TrainingArguments``.

    Returns:
        ``(trainer, eval_res, y_true, y_prob)``: the fitted Trainer, the dict
        returned by ``trainer.evaluate()``, the validation labels, and the
        softmax probability of class 1 for every validation example.
    """
    # The classification head is randomly initialised when the checkpoint is
    # loaded, i.e. before the Trainer applies `seed`. Re-seeding here makes the
    # head identical for every call, whatever ran earlier in the session.
    torch.manual_seed(SEED)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(DEVICE)

    # Build minimal, old-version-friendly TrainingArguments
    args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=50,
        seed=SEED,
        # keep it simple: omit evaluation_strategy/save_strategy/load_best_model_at_end/report_to/metric_for_best_model
        fp16=torch.cuda.is_available(),
    )

    # If DataCollatorWithPadding exists, use it; otherwise rely on padding='max_length' in tokenization
    try:
        from transformers import DataCollatorWithPadding
        collator = DataCollatorWithPadding(tokenizer)
    except Exception:
        collator = None  # fallback; make sure your tokenize() used padding='max_length'

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=hf_tr,
        eval_dataset=hf_va,
        data_collator=collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_res = trainer.evaluate()  # run eval after training

    # Probabilities for threshold tuning
    preds = trainer.predict(hf_va)
    logits = preds.predictions
    y_true = preds.label_ids
    y_prob = torch.softmax(torch.tensor(logits), dim=1)[:, 1].cpu().numpy()

    return trainer, eval_res, y_true, y_prob


In [None]:

summary_rows_tuned = []

# One XLM-R fine-tuning run per language, scored at the Macro-F1-optimal threshold.
for lang in LANGS:
    hf_tr, hf_va = make_hf_datasets_for_lang(df_train, df_val, lang)
    if hf_tr is None:
        print(f"[WARN] No data for {lang}, skipping.")
        continue

    trainer, eval_res, y_true, y_prob = train_xlmr(
        hf_tr, hf_va, output_dir=f"xlmr_{lang}"
    )

    # Threshold chosen by Macro-F1 (mode="youden" is the alternative criterion).
    t_best = sweep_thresholds(y_true, y_prob, mode="f1")
    acc, tpr, fpr, tnr, f1m = metrics_at_threshold(y_true, y_prob, t_best)

    # Only these four rates go into the table, each as a one-decimal percentage.
    row = {"Language": lang.upper()}
    for column, value in (("Accuracy", acc), ("TPR", tpr), ("FPR", fpr), ("TNR", tnr)):
        row[column] = f"{value*100:.1f}%"
    summary_rows_tuned.append(row)

    # Drop the trainer reference and release cached GPU memory before the next language.
    del trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


df_tuned = pd.DataFrame(summary_rows_tuned)

display(df_tuned)





Map:   0%|          | 0/2558 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
50,0.3716
100,0.3244
150,0.3512
200,0.2872
250,0.3211
300,0.2909
350,0.2067
400,0.1056
450,0.1053





Map:   0%|          | 0/2422 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
50,0.2213
100,0.1275
150,0.1196
200,0.1431
250,0.1373
300,0.1039
350,0.1139
400,0.1404
450,0.0723





Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
50,0.2479
100,0.1157
150,0.1652
200,0.1412
250,0.146


Unnamed: 0,Language,τ (Macro-F1),Accuracy,TPR,FPR,TNR,Macro-F1
0,AR,0.95,98.1%,98.3%,3.8%,96.2%,95.7%
1,KO,0.95,94.7%,98.2%,68.4%,31.6%,68.0%
2,TE,0.95,79.9%,97.3%,74.2%,25.8%,63.2%


In [None]:
# Class balance per language: row counts and share of answerable examples.
for lang in LANGS:
    sub_tr = df_train[df_train["lang"] == lang]
    sub_va = df_val[df_val["lang"] == lang]
    if len(sub_tr) == 0:
        continue
    train_part = f"train N={len(sub_tr)} pos={sub_tr['answerable'].mean():.1%}"
    val_part = f"val N={len(sub_va)} pos={sub_va['answerable'].mean():.1%}"
    print(f"{lang.upper()} | {train_part} | {val_part}")


AR | train N=2558 pos=90.0% | val N=415 pos=87.5%
KO | train N=2422 pos=97.4% | val N=356 pos=94.7%
TE | train N=1355 pos=96.7% | val N=384 pos=75.8%


In [None]:
## Weighted trained version

import os, numpy as np, torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

# use the same tokenizer as before; create if missing
# Reuse the tokenizer created by the earlier XLM-R cells; load it only if missing.
if "tokenizer" not in globals():
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Metric objects used by compute_metrics in this cell.
acc_metric = evaluate.load("accuracy")
f1_metric  = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Trainer metric hook: accuracy and macro-F1 of the argmax predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair.

    Returns:
        dict with keys ``accuracy`` and ``macro_f1``.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    accuracy = acc_metric.compute(predictions=predicted, references=labels)["accuracy"]
    macro_f1 = f1_metric.compute(predictions=predicted, references=labels, average="macro")["f1"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}



class WeightedCETrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model logits."""

    def __init__(self, class_weights=None, *args, **kwargs):
        """Forward everything except ``class_weights`` to ``Trainer.__init__``.

        ``class_weights`` (one weight per class, or None for an unweighted
        loss) is stored as a float tensor.
        """
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
        else:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy for one batch.

        The labels are removed from ``inputs`` before the forward pass, so the
        model itself never computes a loss. Extra keyword arguments (such as
        ``num_items_in_batch``) are accepted and ignored.

        Raises:
            KeyError: if ``inputs`` holds neither ``labels`` nor ``label``.
        """
        # Prefer 'labels'; 'label' is what very old versions may provide.
        label_key = next((key for key in ("labels", "label") if key in inputs), None)
        if label_key is None:
            raise KeyError("No 'labels' key found in inputs for loss computation.")
        labels = inputs.pop(label_key)

        outputs = model(**inputs)
        logits = outputs.logits

        # Move the weights to wherever the logits live before computing the loss.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.device)
        loss = torch.nn.functional.cross_entropy(logits, labels, weight=weight)

        if return_outputs:
            return loss, outputs
        return loss



def get_class_weights(hf_train):
    """Compute inverse-frequency weights for the two classes of ``hf_train["label"]``.

    Each class ``c`` gets ``N / (2 * n_c)``, where ``N`` is the number of
    examples and ``n_c`` the count of class ``c``. Both weights are 1.0 for a
    perfectly balanced split, and the rarer class gets the larger weight.
    Counts and the total are floored at 1 so the division is always defined.

    Returns:
        ``[weight_for_class_0, weight_for_class_1]``.
    """
    labels = np.array(hf_train["label"])
    counts = [(labels == cls).sum() for cls in (0, 1)]
    total = max(counts[0] + counts[1], 1)
    return [total / (2 * max(count, 1)) for count in counts]

def train_xlmr_weighted(hf_tr, hf_va, output_dir):
    """Fine-tune xlm-roberta-base with a class-weighted loss and score validation.

    Args:
        hf_tr: tokenised training dataset; its ``label`` column also drives the
            class weights.
        hf_va: tokenised validation dataset.
        output_dir: directory handed to ``TrainingArguments``.

    Returns:
        ``(trainer, y_true, y_prob)``: the fitted WeightedCETrainer, the
        validation labels, and the softmax probability of class 1 for every
        validation example.
    """
    # The classification head is randomly initialised when the checkpoint is
    # loaded, i.e. before the Trainer applies `seed`. Re-seeding here (same
    # value as `seed` below) makes the head identical for every call.
    torch.manual_seed(42)
    model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)

    args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=50,
        seed=42,
        fp16=torch.cuda.is_available(),
    )

    # dynamic class weights from the train split
    cw = get_class_weights(hf_tr)

    # simple padding collator (handle old/new transformers)
    try:
        from transformers import DataCollatorWithPadding
        collator = DataCollatorWithPadding(tokenizer)
    except Exception:
        collator = None

    trainer = WeightedCETrainer(
        class_weights=cw,
        model=model, args=args,
        train_dataset=hf_tr, eval_dataset=hf_va,
        data_collator=collator, tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()
    _ = trainer.evaluate()

    # Class-1 probabilities on the validation set, for threshold tuning.
    preds = trainer.predict(hf_va)
    logits = preds.predictions
    y_true = preds.label_ids
    y_prob = torch.softmax(torch.tensor(logits), dim=1)[:, 1].cpu().numpy()
    return trainer, y_true, y_prob


In [None]:

summary_rows_weighted = []

# Same protocol as the unweighted runs, but trained with the class-weighted loss.
for lang in LANGS:
    hf_tr, hf_va = make_hf_datasets_for_lang(df_train, df_val, lang)
    if hf_tr is None:
        continue

    trainer_w, y_true_w, y_prob_w = train_xlmr_weighted(
        hf_tr, hf_va, output_dir=f"xlmr_weighted_{lang}"
    )

    # Threshold tuned by Macro-F1, the same criterion used for df_tuned.
    t_f1 = sweep_thresholds(y_true_w, y_prob_w, mode="f1")
    acc, tpr, fpr, tnr, f1m = metrics_at_threshold(y_true_w, y_prob_w, t_f1)

    # Four rates per language, each as a one-decimal percentage; the "_w"
    # suffix marks the weighted run.
    row_w = {"Language": lang.upper()}
    for column_w, value_w in (
        ("Accuracy_w", acc),
        ("TPR_w", tpr),
        ("FPR_w", fpr),
        ("TNR_w", tnr),
    ):
        row_w[column_w] = f"{value_w*100:.1f}%"
    summary_rows_weighted.append(row_w)

    # Release the trainer and cached GPU memory before the next language.
    del trainer_w
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

df_weighted = pd.DataFrame(summary_rows_weighted)

# Put weighted results next to the unweighted table when that table exists;
# otherwise show the weighted table on its own.
if "df_tuned" in globals():
    comp = df_tuned.merge(df_weighted, on="Language", how="outer", suffixes=("_orig", "_w"))
else:
    comp = df_weighted.copy()

display(comp)


Map:   0%|          | 0/2558 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.7152
100,0.6634
150,0.7322
200,0.4156
250,0.1603
300,0.1234
350,0.1754
400,0.1273
450,0.2508


Map:   0%|          | 0/2422 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.9494
100,1.0611
150,1.1589
200,1.2563
250,1.0654
300,0.928
350,1.0713
400,1.3407
450,0.7741


Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  super().__init__(*args, **kwargs)


Step,Training Loss
50,0.803
100,0.8232
150,1.2149
200,0.7897
250,1.147


Unnamed: 0,Language,τ (Macro-F1),Accuracy,TPR,FPR,TNR,Macro-F1,τ_w(Macro-F1),Accuracy_w,TPR_w,FPR_w,TNR_w,Macro-F1_w
0,AR,0.95,98.1%,98.3%,3.8%,96.2%,95.7%,0.95,97.6%,98.3%,7.7%,92.3%,94.6%
1,KO,0.95,94.7%,98.2%,68.4%,31.6%,68.0%,0.05,94.7%,100.0%,100.0%,0.0%,48.6%
2,TE,0.95,79.9%,97.3%,74.2%,25.8%,63.2%,0.95,78.4%,100.0%,89.2%,10.8%,53.5%


## CNN classifier (Learned word embedding)


In [None]:
# Fixed token-sequence length for the CNN input. Presumably passed as `max_len`
# to the CNN dataset/encoder; the call site is not in this part of the notebook.
MAX_TOKENS = 256   # sequence length for CNN

def normalize_text(x):
    """Return ``x`` as NFKC-normalised text with whitespace collapsed.

    ``None`` and float NaN map to the empty string; any other value is
    converted with ``str``, runs of whitespace become a single space, and the
    result is stripped.
    """
    is_missing = x is None or (isinstance(x, float) and math.isnan(x))
    if is_missing:
        return ""
    normalised = unicodedata.normalize("NFKC", str(x))
    collapsed = re.sub(r"\s+", " ", normalised)
    return collapsed.strip()

# threshold utilities (reuse if you already have them)
def sweep_thresholds(y_true, y_prob, mode="f1"):
    """Return the best threshold in 0.05..0.95 by macro-F1 (default) or Youden's J.

    Candidates are 0.05, 0.10, ..., 0.95, tried in increasing order; the first
    one reaching the best score wins.

    Args:
        y_true: array of 0/1 labels.
        y_prob: array of class-1 probabilities, aligned with ``y_true``.
        mode: ``"youden"`` for TPR - FPR, anything else for macro-F1.

    Returns:
        The chosen threshold as a Python float.
    """
    best_t, best_score = 0.5, -1
    for t in np.linspace(0.05, 0.95, 19):
        y_hat = (y_prob >= t).astype(int)
        if mode == "youden":
            # labels=[0, 1] pins a 2x2 matrix; without it a split where only one
            # class occurs yields a 1x1 matrix and the 4-way unpack fails.
            tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
            tpr = tp/(tp+fn) if (tp+fn) else 0.0
            fpr = fp/(fp+tn) if (fp+tn) else 0.0
            score = tpr - fpr
        else:
            score = f1_score(y_true, y_hat, average="macro")
        if score > best_score:
            best_score, best_t = score, t
    return float(best_t)

def metrics_at_threshold(y_true, y_prob, t):
    """Binarise ``y_prob`` at threshold ``t`` and compute summary metrics.

    Args:
        y_true: array of 0/1 labels.
        y_prob: array of class-1 probabilities, aligned with ``y_true``.
        t: decision threshold; a sample is predicted 1 when ``y_prob >= t``.

    Returns:
        Tuple ``(accuracy, TPR, FPR, TNR, macro_F1)``. A rate whose denominator
        is zero is reported as 0.0.
    """
    y_hat = (y_prob >= t).astype(int)
    # labels=[0, 1] pins a 2x2 matrix; without it a split where only one class
    # occurs yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
    acc = (tp+tn)/(tp+tn+fp+fn)
    tpr = tp/(tp+fn) if (tp+fn) else 0.0
    fpr = fp/(fp+tn) if (fp+tn) else 0.0
    tnr = tn/(tn+fp) if (tn+fp) else 0.0
    f1m = f1_score(y_true, y_hat, average="macro")
    return acc, tpr, fpr, tnr, f1m

In [None]:
# Guard: define the NLLB translation helpers only when they are not already in
# the namespace (i.e. the translation cells above were not run in this session).
try:
    translate_series_to_en, NLLB_CODES
except NameError:
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    NLLB_MODEL_ID = "facebook/nllb-200-distilled-600M"
    NLLB_CODES = dict(ar="arb_Arab", ko="kor_Hang", te="tel_Telu", en="eng_Latn")

    _tok_mt = AutoTokenizer.from_pretrained(NLLB_MODEL_ID)
    _mt = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL_ID).to(DEVICE)

    def _translate_batch(sentences, src_code, max_new_tokens=128):
        """Translate one batch of sentences into English; returns whitespace-normalised strings."""
        _tok_mt.src_lang = src_code
        english_bos = _tok_mt.convert_tokens_to_ids(NLLB_CODES["en"])
        batch = _tok_mt(sentences, return_tensors="pt", padding=True, truncation=True)
        batch = batch.to(DEVICE)
        with torch.no_grad():
            generated = _mt.generate(
                **batch,
                forced_bos_token_id=english_bos,
                max_new_tokens=max_new_tokens,
            )
        cleaned = []
        for text in _tok_mt.batch_decode(generated, skip_special_tokens=True):
            cleaned.append(re.sub(r"\s+"," ", text).strip())
        return cleaned

    def translate_series_to_en(series, src_code, batch_size=16):
        """Translate a pandas Series batch by batch, keeping its index; None becomes ""."""
        translated = []
        for start in range(0, len(series), batch_size):
            values = series.iloc[start:start + batch_size].tolist()
            chunk = ["" if value is None else str(value) for value in values]
            translated.extend(_translate_batch(chunk, src_code))
        return pd.Series(translated, index=series.index)


In [None]:
# NLTK tokenizer setup
# !pip install -q nltk
from nltk.tokenize import ToktokTokenizer
# Toktok instance used by nltk_tokenize below.
toktok = ToktokTokenizer()

# word_tokenize alternative. These two lines are active (not commented out), so
# the punkt resources are downloaded and word_tokenize is imported even though
# nltk_tokenize below uses Toktok.
import nltk; nltk.download("punkt"); nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize

def nltk_tokenize(text: str):
    """Lower-case ``text`` (coerced to ``str``) and split it with the Toktok tokenizer.

    To switch to ``word_tokenize``, return ``word_tokenize(text.lower())`` instead.
    """
    lowered = str(text).lower()
    return toktok.tokenize(lowered)

# ---------- vocab & encoding WITH a real <sep> token ----------
from collections import Counter
import numpy as np
import re

# Reserved tokens; their list positions become ids 0, 1 and 2.
SPECIALS = ["<pad>", "<unk>", "<sep>"]

def build_vocab_pairs(questions, contexts, min_freq=2, max_size=50_000):
    """Build a token <-> id vocabulary from paired question/context texts.

    Tokens are counted over both texts of every pair. Those seen at least
    ``min_freq`` times are kept, most frequent first, up to ``max_size`` tokens,
    and numbered after the SPECIALS entries.

    Returns:
        ``(stoi, itos)``: token -> id and id -> token dictionaries.
    """
    counts = Counter()
    for question, context in zip(questions, contexts):
        counts.update(nltk_tokenize(question))
        counts.update(nltk_tokenize(context))

    # most_common() is ordered by descending count
    frequent = [word for word, freq in counts.most_common() if freq >= min_freq]

    stoi = dict(zip(SPECIALS, range(len(SPECIALS))))
    for word in frequent[:max_size]:
        # setdefault leaves a reserved id untouched if the corpus contains that token
        stoi.setdefault(word, len(stoi))
    itos = dict((idx, word) for word, idx in stoi.items())
    return stoi, itos

def encode_pair(q, c, stoi, max_len=256):
    """Encode a (question, context) pair as a fixed-length array of token ids.

    The sequence is ``question tokens + "<sep>" + context tokens``. Tokens
    missing from ``stoi`` map to ``<unk>``; the result is cut to ``max_len`` and
    right-padded with ``<pad>``.

    Returns:
        ``np.int64`` array of token ids.
    """
    tokens = [*nltk_tokenize(q), "<sep>", *nltk_tokenize(c)]
    unk_id = stoi["<unk>"]
    ids = [stoi.get(token, unk_id) for token in tokens][:max_len]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids.extend([stoi["<pad>"]] * shortfall)
    return np.array(ids, dtype=np.int64)

# ---------- dataset that uses question_en + context_en and inserts <sep> ----------
import torch
from torch.utils.data import Dataset

class TextClsDataset(Dataset):
    """Torch dataset over a frame with ``question_en``, ``context_en`` and ``label``.

    Each item is ``(token_ids, label)``: the ids come from ``encode_pair`` (which
    inserts the ``<sep>`` token between question and context) and both tensors
    have dtype ``torch.long``.
    """

    def __init__(self, df, stoi, max_len):
        self.q, self.c = (df[column].tolist() for column in ("question_en", "context_en"))
        self.y = df["label"].astype(int).values
        self.stoi, self.max_len = stoi, max_len

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        encoded = encode_pair(self.q[i], self.c[i], self.stoi, self.max_len)
        features = torch.tensor(encoded, dtype=torch.long)
        target = torch.tensor(self.y[i], dtype=torch.long)
        return features, target


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# The NLLB fallback (model load plus `_translate_batch` / `translate_series_to_en`)
# is already provided by the identical guard cell earlier in this notebook, so
# it is not repeated here. This cell only checks that the helpers are available
# and fails with a clear message if they are not.
try:
    translate_series_to_en, NLLB_CODES
except NameError as err:
    raise RuntimeError(
        "translate_series_to_en / NLLB_CODES are undefined; run the NLLB "
        "translation setup cell earlier in the notebook first."
    ) from err


In [None]:
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class KimCNN(nn.Module):
    """Text CNN: embedding -> parallel 1-D convolutions -> global max-pool -> linear.

    ``forward`` takes a ``(B, L)`` tensor of token ids and returns
    ``(B, num_classes)`` logits. ``L`` must be at least the largest kernel size.
    """

    def __init__(self, vocab_size, emb_dim=300, num_filters=128, kernel_sizes=(3,4,5), dropout=0.3, num_classes=2):
        super().__init__()
        # Row 0 of the embedding table is the padding index.
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        branches = []
        for width in kernel_sizes:
            branches.append(nn.Conv1d(emb_dim, num_filters, width))
        self.convs = nn.ModuleList(branches)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * num_filters, num_classes)

    def forward(self, input_ids):
        # (B, L) ids -> (B, L, E) embeddings -> (B, E, L), the layout Conv1d expects
        channels_first = self.emb(input_ids).transpose(1, 2)
        pooled = []
        for conv in self.convs:
            activated = torch.relu(conv(channels_first))
            pooled.append(activated.max(dim=2).values)  # max over the sequence axis
        features = torch.cat(pooled, dim=1)
        return self.fc(self.dropout(features))

def train_cnn(train_ds, val_ds, vocab_size, class_weights=None, epochs=6, lr=3e-4):
    """Train a KimCNN and return it restored to its best validation epoch.

    After every epoch the model is scored on ``val_ds``: a probability
    threshold is chosen with ``sweep_thresholds(..., mode="f1")`` (defined
    elsewhere) and the macro-F1 at that threshold decides whether the epoch
    becomes the new best.

    Args:
        train_ds: dataset yielding ``(input_ids, label)`` pairs for training.
        val_ds: dataset yielding ``(input_ids, label)`` pairs for validation.
        vocab_size: vocabulary size passed to ``KimCNN``.
        class_weights: optional per-class weights for the cross-entropy loss.
        epochs: number of training epochs.
        lr: AdamW learning rate.

    Returns:
        ``(model, y_true, y_prob)`` where ``model`` carries the weights of the
        best epoch and ``y_true`` / ``y_prob`` are the validation labels and
        class-1 probabilities recorded at that same epoch. With ``epochs < 1``
        the untrained model is returned together with ``None, None``.
    """
    model = KimCNN(vocab_size=vocab_size, emb_dim=300, num_filters=128, kernel_sizes=(3,4,5), dropout=0.3).to(DEVICE)
    train_loader = DataLoader(train_ds, batch_size=64, shuffle=True, num_workers=2)
    val_loader   = DataLoader(val_ds,   batch_size=128, shuffle=False, num_workers=2)

    weight = torch.tensor(class_weights, dtype=torch.float).to(DEVICE) if class_weights is not None else None
    criterion = nn.CrossEntropyLoss(weight=weight)
    optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)

    best = {"f1": -1, "state": None, "y_true": None, "y_prob": None}
    for _ in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optim.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()

        # eval
        model.eval()
        probs, labels = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(DEVICE)
                logits = model(xb)
                # Probability of class 1.
                p = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
                probs.append(p); labels.append(yb.numpy())
        y_prob = np.concatenate(probs); y_true = np.concatenate(labels)

        # pick threshold by Macro-F1 for model selection
        t = sweep_thresholds(y_true, y_prob, mode="f1")
        f1m = f1_score(y_true, (y_prob >= t).astype(int), average="macro")
        if f1m > best["f1"]:
            # state_dict() hands back references to the live parameter tensors,
            # which later epochs overwrite in place. Clone them so the snapshot
            # really preserves this epoch's weights.
            snapshot = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            best = {"f1": f1m, "state": snapshot, "y_true": y_true, "y_prob": y_prob}

    # No snapshot exists when no epoch ran (epochs < 1); keep the initial weights.
    if best["state"] is not None:
        model.load_state_dict(best["state"])
    return model, best["y_true"], best["y_prob"]


In [None]:
# Per-language CNN experiment: train one model per language and collect the
# validation metrics at two operating thresholds into a summary table.
results_cnn = []

for lang in LANGS:
    print("\n" + "="*10 + f" {lang.upper()} " + "="*10)
    # filter (.copy() so the column assignments below leave df_train/df_val untouched)
    tr = df_train[df_train["lang"]==lang].copy()
    va = df_val[df_val["lang"]==lang].copy()
    if tr.empty or va.empty:
        print(f"[WARN] no data for {lang}, skipping.")
        continue

    # labels: integer cast of the `answerable` column
    tr["label"] = tr["answerable"].astype(int)
    va["label"] = va["answerable"].astype(int)

    # context already EN
    tr["context_en"] = tr["context"].astype(str).apply(normalize_text)
    va["context_en"] = va["context"].astype(str).apply(normalize_text)

    # ensure translated question_en exists; translate if missing.
    # Each split is checked on its own: looking only at the training frame
    # would leave the validation frame untranslated whenever train already
    # has the column but validation does not (or contains NaNs).
    src = NLLB_CODES[lang]
    for frame in (tr, va):
        if "question_en" not in frame.columns or frame["question_en"].isna().any():
            frame["question_en"] = translate_series_to_en(frame["question"].astype(str), src)

    # build vocab on training questions+contexts (not concatenated text)
    stoi, itos = build_vocab_pairs(tr["question_en"], tr["context_en"], min_freq=2, max_size=50_000)

    # datasets (now use the new TextClsDataset that encodes pairs and inserts <sep>)
    ds_tr = TextClsDataset(tr, stoi, MAX_TOKENS)
    ds_va = TextClsDataset(va, stoi, MAX_TOKENS)

    # class weights to handle imbalance: inverse class frequency, N / (2 * n_c);
    # the max(..., 1) guards avoid division by zero when a class is absent.
    y = tr["label"].values
    n0, n1 = (y==0).sum(), (y==1).sum()
    N = max(n0+n1, 1)
    class_w = [N/(2*max(n0,1)), N/(2*max(n1,1))]

    # train; y_true / y_prob are the validation labels and class-1 probabilities
    model, y_true, y_prob = train_cnn(ds_tr, ds_va, vocab_size=len(stoi), class_weights=class_w, epochs=6, lr=3e-4)

    # Evaluate at two thresholds
    # NOTE(review): both thresholds are selected and scored on the same
    # validation predictions, so these numbers are likely optimistic.
    t_f1 = sweep_thresholds(y_true, y_prob, mode="f1")
    acc, tpr, fpr, tnr, f1m = metrics_at_threshold(y_true, y_prob, t_f1)

    t_y  = sweep_thresholds(y_true, y_prob, mode="youden")
    acc_y, tpr_y, fpr_y, tnr_y, f1m_y = metrics_at_threshold(y_true, y_prob, t_y)

    results_cnn.append({
        "Language": lang.upper(),
        "τ(Macro-F1)": round(t_f1,2),
        "Accuracy": f"{acc*100:.1f}%", "TPR": f"{tpr*100:.1f}%", "FPR": f"{fpr*100:.1f}%", "TNR": f"{tnr*100:.1f}%", "Macro-F1": f"{f1m*100:.1f}%",
        "τ(Youden)": round(t_y,2),
        "Accuracy_Y": f"{acc_y*100:.1f}%", "TPR_Y": f"{tpr_y*100:.1f}%", "FPR_Y": f"{fpr_y*100:.1f}%", "TNR_Y": f"{tnr_y*100:.1f}%", "Macro-F1_Y": f"{f1m_y*100:.1f}%"
    })

df_cnn = pd.DataFrame(results_cnn)
display(df_cnn)







Unnamed: 0,Language,τ(Macro-F1),Accuracy,TPR,FPR,TNR,Macro-F1,τ(Youden),Accuracy_Y,TPR_Y,FPR_Y,TNR_Y,Macro-F1_Y
0,AR,0.8,91.6%,93.9%,25.0%,75.0%,82.1%,0.95,76.6%,73.8%,3.8%,96.2%,67.7%
1,KO,0.7,94.9%,98.2%,63.2%,36.8%,70.6%,0.9,79.2%,80.1%,36.8%,63.2%,56.2%
2,TE,0.95,80.2%,85.9%,37.6%,62.4%,73.6%,0.95,80.2%,85.9%,37.6%,62.4%,73.6%
