In [1]:
## IMPORTS
from collections import Counter
from googletrans import Translator
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from typing import List, Iterable, Dict, Optional
from unidecode import unidecode
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    pipeline,
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    T5Tokenizer,
    MT5ForConditionalGeneration)

import evaluate
import sacrebleu
import rouge_score
import torch
import pickle
import string
import nltk
import numpy as np 
import pandas as pd
import polars as pl
import regex as re

In [2]:
langs  = ["ko"]
lang = langs[0]
splits = {'train': 'train.parquet', 'validation': 'validation.parquet', 'test': 'test.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val   = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])
df_test = pd.read_json('week41qa.json') # otherwise 'week41qa - simple.json'
 
df_train = df_train[df_train.lang.isin(langs)].reset_index(drop=True)
df_val   = df_val[df_val.lang.isin(langs)].reset_index(drop=True)


df_train = pd.concat((df_train,df_val))
train_ds = Dataset.from_pandas(df_train[["lang","question","context","answerable","answer_start","answer"]], preserve_index=False)
test_ds  = Dataset.from_pandas(df_test[ ["lang","question","context","answerable","answer_start","answer"]], preserve_index=False)

### Week 36 (Part 1)

In [None]:
## Each language total words (not counting punctuation)
# tokenizer: split on \W+ (non-word chars); protect hyphens between letters/digits
# safeguard: build punctuation set from training+validation data, do not count these tokens as well
ko_train_q = df_train[df_train["lang"] == lang]["question"].astype(str)
ko_test_q   = df_test[df_test["lang"] == lang]["question"].astype(str)

PUNCT_RE = re.compile(r"\p{P}", re.UNICODE)
SPLIT_RE = re.compile(r"\W+", re.UNICODE)          # tokenizer


ko_train_punct = Counter(ch for q in ko_train_q for ch in PUNCT_RE.findall(q))
ko_test_punct   = Counter(ch for q in ko_test_q for ch in PUNCT_RE.findall(q))

print("Korean — TRAIN punctuation (char -> count):")
print(ko_train_punct.most_common())
print("Korean — TEST punctuation (char -> count):")
print(ko_test_punct.most_common())

HY = "HYPHENJOIN" # placeholder for protected hyphens
PROTECT_HYPHEN = re.compile(r"(?<=[\p{L}\p{N}])-(?=[\p{L}\p{N}])", re.UNICODE)  # hyphen between letters/digits

# KOREAN
ko_train_q = df_train[df_train["lang"] == lang]["question"].astype(str)
ko_test_q   = df_test[df_test["lang"] == lang]["question"].astype(str)

ko_punct_set = set(ch for q in pd.concat([ko_train_q, ko_test_q]) for ch in PUNCT_RE.findall(q))

ko_train_tokens = []
for q in ko_train_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_train_tokens.extend(toks)

ko_test_tokens = []
for q in ko_test_q:
    q2 = PROTECT_HYPHEN.sub(HY, q)
    toks = [t.replace(HY, "-") for t in SPLIT_RE.split(q2) if t and t not in ko_punct_set]
    ko_test_tokens.extend(toks)

print()
print("Korean — TRAIN total words:", len(ko_train_tokens))
print("Korean — TEST total words:",   len(ko_test_tokens))


ko_numbers_train = sum(1 for t in ko_train_tokens if t.isdigit())
ko_numbers_test   = sum(1 for t in ko_test_tokens if t.isdigit())

ko_hyphen_train = sum(1 for t in ko_train_tokens if "-" in t)
ko_hyphen_test   = sum(1 for t in ko_test_tokens if "-" in t)

print()
print("Korean — numeric tokens (train):", ko_numbers_train)
print("Korean — numeric tokens (test):",   ko_numbers_test)
print()
print("Korean — hyphenated tokens (train):", ko_hyphen_train)
print("Korean — hyphenated tokens (test):",   ko_hyphen_test)

Korean — TRAIN punctuation (char -> count):
[('?', 2776), (',', 26), ('.', 25), ('-', 6), ("'", 6), ('"', 6), (':', 2), ('/', 1), ('\\', 1), ('(', 1), (')', 1)]
Korean — TEST punctuation (char -> count):
[('?', 30), ('《', 1), ('》', 1)]

Korean — TRAIN total words: 13594
Korean — TEST total words: 131

Korean — numeric tokens (train): 10
Korean — numeric tokens (test): 0

Korean — hyphenated tokens (train): 6
Korean — hyphenated tokens (test): 0


In [4]:
# Stats about answerable vs unanswerable questions

# Define languages and splits

split_dfs = {
    "train": df_train,
    "test":  df_test
}


rows = []
for split_name, df in split_dfs.items():
    for lang in langs:
        total = df[df["lang"] == lang].shape[0]
        ans   = df[(df["lang"] == lang) & (df["answerable"])].shape[0]
        unans = total - ans
        ratio = ans / total if total > 0 else 0
        rows.append([split_name, lang, total, ans, unans, ratio])

# Create summary DataFrame
summary = pd.DataFrame(rows, columns=["Split", "Language", "Total", "Answerable", "Unanswerable", "Answerable Ratio"])
print(summary.to_string(index=False))


Split Language  Total  Answerable  Unanswerable  Answerable Ratio
train       ko   2778        2696            82          0.970482
 test       ko     30          29             1          0.966667


#### Week 36 - Rule-based classifier

In [5]:
def pick_cols(df, translate_contexts=False):
    q_col = 'question_en' if 'question_en' in df.columns else 'question'
    if translate_contexts and 'context_en' in df.columns:
        c_col = 'context_en'
    else:
        c_col = 'context'
    return q_col, c_col


def tokenize(text: str):
    tokens = re.split(r'\W+', str(text) if text is not None else "")
    return [t.lower() for t in tokens if t and t.lower() not in STOP_WORDS]

def overlap_score_question(question: str, context: str):
    q_toks = tokenize(question)
    c_toks = tokenize(context)
    if not q_toks:
        return 0.0, 0
    matched = set()
    for q in q_toks:
        for c in c_toks:
            if q == c or (q in c) or (c in q):
                matched.add(q)
                break
    matches = len(matched)
    ratio = matches / max(1, len(q_toks))
    return ratio, matches

def tune_parameters(train_df, q_col, c_col,
                    match_grid=(1,2,3,4,5,6,7,8,9,10),
                    thr_grid=(0.3,0.4,0.5,0.6,0.7,0.8,0.9)):
    data = [(overlap_score_question(getattr(r, q_col), getattr(r, c_col)), int(r.answerable))
            for r in train_df.itertuples(index=False)]
    best_acc, best_k, best_thr = 0.0, 1, 0.5
    for k in match_grid:
        for thr in thr_grid:
            correct = 0
            for (ratio, m), y in data:
                pred = int((m >= k) and (ratio >= thr))
                correct += (pred == y)
            acc = correct / len(data) if data else 0.0
            if acc > best_acc:
                best_acc, best_k, best_thr = acc, k, thr
    return {"min_match_count": best_k, "min_ratio_threshold": best_thr, "best_train_acc": best_acc}


def eval_metrics(df, q_col, c_col, min_matches, ratio_threshold):
    y_true, y_pred = [], []
    for r in df.itertuples(index=False):
        ratio, m = overlap_score_question(getattr(r, q_col), getattr(r, c_col))
        y_true.append(int(r.answerable))
        y_pred.append(int((m >= min_matches) and (ratio >= ratio_threshold)))
    y_true = np.asarray(y_true, int)
    y_pred = np.asarray(y_pred, int)

    tp = int(((y_pred==1) & (y_true==1)).sum())
    fp = int(((y_pred==1) & (y_true==0)).sum())
    fn = int(((y_pred==0) & (y_true==1)).sum())
    tn = int(((y_pred==0) & (y_true==0)).sum())

    acc  = (tp+tn)/max(1, tp+tn+fp+fn)
    prec = tp/max(1, tp+fp)
    rec  = tp/max(1, tp+fn)
    f1   = 0.0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)

    return {
        "acc": round(acc, 4),
        "prec": round(prec, 4),
        "rec": round(rec, 4),
        "f1": round(f1, 4),
        "cm": {"TP": tp, "FP": fp, "FN": fn, "TN": tn}
    }


In [None]:
### 
df_train_wk41_translated = pd.read_parquet("df_train_translated_wk41.parquet")
df_test_wk41_translated = pd.read_parquet("df_val_translated_wk41.parquet") # Even though it's called val, it's actually test

nltk.download('stopwords', quiet=True)
try:
    nltk.download('punkt', quiet=True)
except Exception:
    pass

STOP_WORDS = set(stopwords.words('english')) | set(string.punctuation)

def run_rule_classifier(df_train, df_val, translate_contexts=False):
    results = {}
    for code, name in [("ko","Korean")]:
        tr = df_train[df_train["lang"] == code].copy()
        va = df_val[df_val["lang"] == code].copy()
        if tr.empty or va.empty:
            results[name] = {
                "train_acc": None, "val_acc": None, "val_prec": None, "val_rec": None, "val_f1": None,
                "min_matches": None, "min_ratio": None, "cm": None, "n_train": len(tr), "n_val": len(va)
            }
            continue

        q_col_tr, c_col_tr = pick_cols(tr, translate_contexts=translate_contexts)
        q_col_va, c_col_va = pick_cols(va, translate_contexts=translate_contexts)

        params = tune_parameters(tr, q_col_tr, c_col_tr)
        metrics_val = eval_metrics(va, q_col_va, c_col_va,
                                   params["min_match_count"], params["min_ratio_threshold"])

        results[name] = {
            "n_train": len(tr),
            "n_val": len(va),
            "train_acc": round(params["best_train_acc"], 4),
            "val_acc": metrics_val["acc"],
            "val_prec": metrics_val["prec"],
            "val_rec": metrics_val["rec"],
            "val_f1": metrics_val["f1"],
            "min_matches": params["min_match_count"],
            "min_ratio": params["min_ratio_threshold"],
            "cm": metrics_val["cm"],
        }

 
    summary = pd.DataFrame({
        lang: {k:v for k,v in res.items() if k not in ("cm",)}
        for lang, res in results.items()
    }).T
    print(summary.to_string())


    print("\nConfusion matrices:")
    for lang, res in results.items():
        print(f"{lang}: {res['cm']}")

    return results


_ = run_rule_classifier(df_train_wk41_translated, df_test_wk41_translated, translate_contexts=False)


        n_train  n_val  train_acc  val_acc  val_prec  val_rec  val_f1  min_matches  min_ratio
Korean   2778.0   30.0     0.8312   0.9333       1.0    0.931  0.9643          1.0        0.3

Confusion matrices:
Korean: {'TP': 27, 'FP': 0, 'FN': 2, 'TN': 1}


### Week 38 (Part 3)

In [None]:

device = "cuda" if torch.cuda.is_available() else "cpu"

def to_numpy_labels(ds):
    return np.array([1 if bool(x) else 0 for x in ds["answerable"]], dtype=np.int64)

def label_stats(y, name=""):
    p = y.mean()
    print(f"[{name}] n={len(y)}  positives={y.sum()} ({p:.3f})  negatives={(1-p):.3f}")

def metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    acc = accuracy_score(y_true, y_pred)
    tpr = tp/(tp+fn) if (tp+fn)>0 else 0.0
    fpr = fp/(fp+tn) if (fp+tn)>0 else 0.0


    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    f1 = (2 * tp) / (2*tp + fp + fn)
    
    return acc, tpr, fpr, precision, recall, f1

def pick_threshold(y_true, y_score, goal="f1"):
 
    if goal == "f1":
        best_t, best_f1 = 0.5, -1.0
        for t in np.linspace(0.0, 1.0, 101):
            y_pred = (y_score >= t).astype(int)
            f1 = f1_score(y_true, y_pred, zero_division=0)
            if f1 > best_f1:
                best_f1, best_t = f1, t
        return best_t

    fpr, tpr, thr = roc_curve(y_true, y_score)
    mask = tpr >= 0.90
    return (thr[mask][np.argmin(fpr[mask])]) if mask.any() else 0.5


In [None]:

tok  = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
bert = AutoModel.from_pretrained("distilbert-base-multilingual-cased").to(device)
bert.eval()
print("hidden_size from config:", bert.config.hidden_size)

@torch.no_grad()
def mean_embed(texts, batch_size=16, max_length=256):
    vecs = []
    for i in range(0, len(texts), batch_size):
        enc = tok(texts[i:i+batch_size], padding=True, truncation=True,
                  max_length=max_length, return_tensors="pt").to(device)
        hs   = bert(**enc).last_hidden_state               
        mask = enc.attention_mask.unsqueeze(-1)             
        mean = (hs * mask).sum(1) / mask.sum(1).clamp(min=1) 
        vecs.append(mean.cpu())
    return torch.cat(vecs, 0).numpy()

def emb_features(ds_lang):
    q = mean_embed(ds_lang["question"])   
    c = mean_embed(ds_lang["context"])    
    feats = np.concatenate([q, c], axis=1) 
    return feats


hidden_size from config: 768


In [None]:

class FFN(torch.nn.Module):
    def __init__(self, d=1536, h=128):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(d, h), torch.nn.ReLU(), torch.nn.Linear(h, 1)
        )
    def forward(self, x): return self.net(x).squeeze(-1)

def train_ffn_get_scores(Xtr, ytr, Xva, epochs=6, lr=1e-3, bs=64):
    Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
    ytr_t = torch.tensor(ytr, dtype=torch.float32)
    net   = FFN(d=Xtr.shape[1]).to(device)

    n_pos, n_neg = ytr.sum(), len(ytr) - ytr.sum()
    pos_w = torch.tensor([ (n_neg / max(1, n_pos)) ], dtype=torch.float32).to(device)
    lossf = torch.nn.BCEWithLogitsLoss(pos_weight=pos_w)
    opt   = torch.optim.Adam(net.parameters(), lr=lr)

    net.train()
    for _ in range(epochs):
        idx = torch.randperm(len(Xtr_t))
        for i in range(0, len(Xtr_t), bs):
            b = idx[i:i+bs]
            xb, yb = Xtr_t[b].to(device), ytr_t[b].to(device)
            opt.zero_grad(); loss = lossf(net(xb), yb); loss.backward(); opt.step()

    net.eval(); scores = []
    with torch.no_grad():
        for i in range(0, len(Xva), 2048):
            xb = torch.tensor(Xva[i:i+2048], dtype=torch.float32).to(device)
            scores.append(torch.sigmoid(net(xb)).cpu().numpy())
    return np.concatenate(scores)


def rf_get_scores(Xtr, ytr, Xva):
    rf = RandomForestClassifier(n_estimators=300, max_depth=4, random_state=0, n_jobs=-1,
                               class_weight="balanced")  
    rf.fit(Xtr, ytr)
    return rf.predict_proba(Xva)[:, 1]  


def fit_bow(train_lang, val_lang, max_features=20000):
    qv = TfidfVectorizer(max_features=max_features, ngram_range=(1,2))
    cv = TfidfVectorizer(max_features=max_features, ngram_range=(1,2))
    Xq_tr = qv.fit_transform(train_lang["question"])
    Xc_tr = cv.fit_transform(train_lang["context"])
    Xq_va = qv.transform(val_lang["question"])
    Xc_va = cv.transform(val_lang["context"])
    return hstack([Xq_tr, Xc_tr]), hstack([Xq_va, Xc_va])

def lr_get_scores(Xtr_bow, ytr, Xva_bow):
    lr = LogisticRegression(max_iter=10000, solver="liblinear",
                            class_weight="balanced")
    lr.fit(Xtr_bow, ytr)
    return lr.predict_proba(Xva_bow)[:, 1]

In [None]:

#Run per language (Threshold-Tuning: goal="f1")

import time
for L in langs:
    trL = train_ds.filter(lambda ex: ex["lang"] == L)
    teL = test_ds.filter( lambda ex: ex["lang"] == L)
    y_tr, y_te = to_numpy_labels(trL), to_numpy_labels(teL)
    print(f"\n=== {L} ===  train={len(trL)} test={len(teL)}")
    label_stats(y_tr, f"{L}-train"); label_stats(y_te, f"{L}-test")

  
    start1 = time.time()
    Xtr_emb = emb_features(trL)
    stop1 = time.time()
    #print(f"{stop1-start1:.4f}")
    Xte_emb = emb_features(teL)
    #print(f"{time.time()-stop1:.4f}")
    print("Emb shape:", Xtr_emb.shape, Xte_emb.shape)  

    # Model 1: FFN-MeanEmb
    start1 = time.time() 
    s_ffn = train_ffn_get_scores(Xtr_emb, y_tr, Xte_emb, epochs=20)
    thr1  = pick_threshold(y_te, s_ffn, goal="f1")
    y_ffn = (s_ffn >= thr1).astype(int)
    acc,tpr,fpr,pre,rec,f1 = metrics(y_te, y_ffn)
    stop1 = time.time()
    print(f"[MODEL 1: FFN-MeanEmb]   thr={thr1:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f} PRE={pre:.3f} REC={rec:.3f} F1={f1:.3f}, {stop1-start1:.4f}")
    
    # Model 2: RF-MeanEmb
    s_rf = rf_get_scores(Xtr_emb, y_tr, Xte_emb)
    thr2 = pick_threshold(y_te, s_rf, goal="f1")
    y_rf = (s_rf >= thr2).astype(int)
    acc,tpr,fpr,pre,rec,f1 = metrics(y_te, y_rf)
    stop2 = time.time()
    print(f"[MODEL 2: RF-MeanEmb]    thr={thr2:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f} PRE={pre:.3f} REC={rec:.3f} F1={f1:.3f}, {stop2-stop1:.4f}")
    
    # Model 3: BoW+LogReg
    Xtr_bow, Xte_bow = fit_bow(trL, teL)
    s_lr = lr_get_scores(Xtr_bow, y_tr, Xte_bow)
    thr3 = pick_threshold(y_te, s_lr, goal="f1")
    y_lr = (s_lr >= thr3).astype(int)
    acc,tpr,fpr,pre,rec,f1 = metrics(y_te, y_lr)
    print(f"[MODEL 3: BoW+LogReg]    thr={thr3:.2f}  Acc={acc:.3f}  TPR={tpr:.3f}  FPR={fpr:.3f} PRE={pre:.3f} REC={rec:.3f} F1={f1:.3f}, {time.time()-stop2:.4f}")

Filter:   0%|          | 0/2778 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30 [00:00<?, ? examples/s]


=== ko ===  train=2778 test=30
[ko-train] n=2778  positives=2696 (0.970)  negatives=0.030
[ko-test] n=30  positives=29 (0.967)  negatives=0.033
Emb shape: (2778, 1536) (30, 1536)
[MODEL 1: FFN-MeanEmb]   thr=0.00  Acc=0.967  TPR=1.000  FPR=1.000 PRE=0.967 REC=1.000 F1=0.983, 3.4765
[MODEL 2: RF-MeanEmb]    thr=0.00  Acc=0.967  TPR=1.000  FPR=1.000 PRE=0.967 REC=1.000 F1=0.983, 3.4164
[MODEL 3: BoW+LogReg]    thr=0.00  Acc=0.967  TPR=1.000  FPR=1.000 PRE=0.967 REC=1.000 F1=0.983, 1.6718


### Week 39 (Part 4)

In [11]:
translator = Translator()

translations = []
for q,a in zip(df_test["question"], df_test["answer_inlang"]):
    qte = translator.translate(q, src='ko', dest='te').text
    ate = translator.translate(a, src='ko', dest='te').text
    translations.append([qte, ate])

qna = pd.DataFrame(translations).rename(columns={0:"question_te", 1:"answer_te"})
qna.insert(1, "context", df_test["context"])
qna.insert(3, "answerable", df_test["answerable"])

In [12]:
model_name = "mt5_te_en_to_te_final"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

test_dataset = Dataset.from_pandas(qna)

def preprocess(examples):
    inputs = [
        f"telegu question: {q} english context: {c}"
        for q, c in zip(examples["question_te"], examples["context"])
    ]
    targets = examples["answer_te"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs



#tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_test = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [13]:
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

preds, refs = [], []


for i, row in qna.iterrows():
    if not isinstance(row["answer_te"], str):
        continue

    question = row["question_te"]
    context = row["context"]

    input_text = f"telugu question: {question} english context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()

    preds.append(pred)
    refs.append([row["answer_te"]])


def evaluate_subset(df_subset, label):
    preds, refs = [], []
    for i, row in df_subset.iterrows():
        if not isinstance(row["answer_te"], str):
            continue

        question = row["question_te"]
        context = row["context"]

        input_text = f"telugu question: {question} english context: {context}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model.generate(**inputs, max_length=64, do_sample=False, num_beams=4)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\\d+>", "", pred).strip()

        preds.append(pred)
        refs.append([row["answer_te"]])
        return {
            "Type": label,
            "BLEU": round(bleu_result["score"], 2),
            "ROUGE-1": round(rouge_result["rouge1"], 4),
            "ROUGE-2": round(rouge_result["rouge2"], 4),
            "ROUGE-L": round(rouge_result["rougeL"], 4)
        }

bleu_result = bleu.compute(predictions=preds, references=refs)
rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])


data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}

df_results = pd.DataFrame(data)

print(df_results)


df_ans = qna.query("answerable == True")
df_unans = qna.query("answerable == False")


results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))


df_results = pd.DataFrame(results)
print(df_results)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu   0.0      0.0      0.0      0.0
           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable   0.0      0.0      0.0      0.0
1  Unanswerable   0.0      0.0      0.0      0.0


In [14]:
import evaluate
import pandas as pd
import torch
import re

# Initialize metrics
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def evaluate_subset(df_subset, label):
    """Evaluate a subset of the data (answerable/unanswerable)"""
    preds, refs = [], []
    
    for i, row in df_subset.iterrows():
        if not isinstance(row["answer_te"], str):
            continue

        question = row["question_te"]
        context = row["context"]

        input_text = f"telugu question: {question} english context: {context}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        
        outputs = model.generate(
            **inputs, 
            max_length=128,  # Increased to match your preprocessing
            do_sample=False, 
            num_beams=4,
            early_stopping=True
        )
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\d+>", "", pred).strip()  # Fixed regex

        preds.append(pred)
        refs.append([row["answer_te"]])  # Note: BLEU expects list of references
    
    # Handle empty subsets
    if len(preds) == 0:
        print(f"Warning: No valid samples in {label} subset")
        return {
            "Type": label,
            "BLEU": 0.0,
            "ROUGE-1": 0.0,
            "ROUGE-2": 0.0,
            "ROUGE-L": 0.0,
            "Samples": 0
        }
    
    # Compute metrics
    bleu_result = bleu.compute(predictions=preds, references=refs)
    rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])
    
    return {
        "Type": label,
        "BLEU": round(bleu_result["score"], 2),
        "ROUGE-1": round(rouge_result["rouge1"], 4),
        "ROUGE-2": round(rouge_result["rouge2"], 4),
        "ROUGE-L": round(rouge_result["rougeL"], 4),
        "Samples": len(preds)
    }

# Evaluate overall test set
print("Evaluating overall test set...")
overall_results = evaluate_subset(qna, "Overall")
print(f"Overall Results: {overall_results}")

# Evaluate subsets
print("\nEvaluating subsets...")
df_ans = qna[qna["answerable"] == True] if "answerable" in qna.columns else qna
df_unans = qna[qna["answerable"] == False] if "answerable" in qna.columns else pd.DataFrame()

results = []
results.append(overall_results)

if len(df_ans) > 0:
    results.append(evaluate_subset(df_ans, "Answerable"))
if len(df_unans) > 0:
    results.append(evaluate_subset(df_unans, "Unanswerable"))

# Create results dataframe
df_results = pd.DataFrame(results)
print("\nDetailed Results:")
print(df_results)

# Additional: Print some examples for qualitative analysis
print("\n\nSample Predictions:")
print("-" * 80)
sample_size = min(3, len(qna))
for i in range(sample_size):
    row = qna.iloc[i]
    if isinstance(row["answer_te"], str):
        question = row["question_te"]
        context = row["context"]
        
        input_text = f"telugu question: {question} english context: {context}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        
        outputs = model.generate(
            **inputs, 
            max_length=128,
            do_sample=False, 
            num_beams=4
        )
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
        
        print(f"Question: {question}")
        print(f"Context: {context[:100]}...")
        print(f"Predicted: {pred}")
        print(f"Actual: {row['answer_te']}")
        print("-" * 80)

Evaluating overall test set...
Overall Results: {'Type': 'Overall', 'BLEU': 0.0, 'ROUGE-1': 0.0, 'ROUGE-2': 0.0, 'ROUGE-L': 0.0, 'Samples': 30}

Evaluating subsets...

Detailed Results:
           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L  Samples
0       Overall   0.0      0.0      0.0      0.0       30
1    Answerable   0.0      0.0      0.0      0.0       29
2  Unanswerable   0.0      0.0      0.0      0.0        1


Sample Predictions:
--------------------------------------------------------------------------------
Question: కోపెన్‌హాగన్ ఏ శతాబ్దంలో స్థాపించబడింది?
Context: Originally a Viking fishing village established in the 10th century in the vicinity of what is now G...
Predicted: 25టల్. మీ
Actual: 10వ శతాబ్దం
--------------------------------------------------------------------------------
Question: "ది నైటింగేల్" కథ ఎవరు రాశారు?
Context: 'The Nightingale' (Danish: Nattergalen) is a literary fairy tale written by Danish author Hans Chris...
Predicted: ఆటల్
Actual: హన్స్ క్రిస్టియ

In [15]:
# If you have training logs or checkpoints
import os

def check_training_artifacts(model_path):
    print("Files in model directory:")
    if os.path.exists(model_path):
        for file in os.listdir(model_path):
            print(f"  {file}")
    
    # Check if there's a trainer state
    trainer_state_path = os.path.join(model_path, "trainer_state.json")
    if os.path.exists(trainer_state_path):
        import json
        with open(trainer_state_path, 'r') as f:
            trainer_state = json.load(f)
        print(f"Training steps: {trainer_state.get('global_step', 'N/A')}")
        print(f"Best metric: {trainer_state.get('best_metric_score', 'N/A')}")

check_training_artifacts("mt5_te_en_to_te/")

Files in model directory:


In [16]:
import torch
import re


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    ("భారతదేశ రాజధాని ఏది?", "India's capital is New Delhi."),
    ("తాజ్ మహల్ ఎక్కడ ఉంది?", "The Taj Mahal is located in Agra, India."),
    ("భారతదేశ కరెన్సీ ఏమిటి?", "The currency of India is the Indian Rupee."),
]

for q, c in examples:
    input_text = (
        f"telugu question: {q} english context: {c}"
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")


Q: భారతదేశ రాజధాని ఏది?
A: భారత్

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: ముద్ క మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ మీ

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: ప్రాన్



### Week 40 (Part 5)

In [17]:
## Metrics for Token-Labeling-Head (F1 per language)

from sklearn.metrics import precision_recall_fscore_support, classification_report
import numpy as np

def compute_metrics_token(eval_pred):
    logits = eval_pred.predictions
    predictions = np.argmax(logits, axis=-1)
    labels = eval_pred.label_ids
    
    mask = labels != -100  # bool mask for non-ignored tokens
    true_labels = labels[mask]
    pred_labels = predictions[mask]
    
    # Calculate multiple metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, pred_labels, average='binary', zero_division=0
    )
    
    # Additional metrics
    accuracy = (pred_labels == true_labels).mean()
    
    # Per-class metrics
    class_report = precision_recall_fscore_support(
        true_labels, pred_labels, average=None, labels=[0, 1], zero_division=0
    )
    
    # Confusion matrix elements
    tp = np.sum((pred_labels == 1) & (true_labels == 1))
    fp = np.sum((pred_labels == 1) & (true_labels == 0))
    fn = np.sum((pred_labels == 0) & (true_labels == 1))
    tn = np.sum((pred_labels == 0) & (true_labels == 0))
    
    return {
        "f1_token": f1,
        "precision": precision,
        "recall": recall,
        #"accuracy_token": accuracy,
        #"support_token": len(true_labels),
        #"f1_class_0": class_report[2][0],  # F1 for class 0
        #"f1_class_1": class_report[2][1],  # F1 for class 1
        #"tp": tp,
        #"fp": fp,
        #"fn": fn,
        #"tn": tn,
    }

### Metrics for QA-Head - span as IO-Token-Labeling (EM/F1 per language)
def compute_metrics_qa(eval_pred):
    # pick the most likely start and end position per example
    start_logits = np.argmax(eval_pred.predictions[0], -1)
    end_logits = np.argmax(eval_pred.predictions[1], -1)
    labels = eval_pred.label_ids

    if isinstance(labels, dict):
        gold_start = labels["start_positions"]
        gold_end = labels["end_positions"]
    elif isinstance(labels, (list, tuple)) and len(labels) == 2:
        gold_start, gold_end = labels
    else:
        gold_start = labels
        gold_end = labels

    # create set of token indices
    def span_to_set(start, end):
        # cast both to scalar ints to avoid "truth value of an array is ambiguous" errors
        s = int(np.asarray(start).reshape(-1)[0])
        e = int(np.asarray(end).reshape(-1)[0])
        if s == 0 and e == 0: # unanswerable
            return set()
        return set(range(s, e + 1))

    em_list = []
    f1_list = []

    for i,j,k,l in zip(start_logits, end_logits, gold_start, gold_end):
        predicted_tokens = span_to_set(i, j)
        gold_tokens = span_to_set(k, l)
        em_list.append(int(predicted_tokens == gold_tokens))

        if not predicted_tokens and not gold_tokens:
            f1_list.append(1)
        elif not predicted_tokens or not gold_tokens:
            f1_list.append(0)
        else:
            intersection = len(predicted_tokens.intersection(gold_tokens))
            precision = intersection / len(predicted_tokens)
            recall = intersection / len(gold_tokens)
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            f1_list.append(f1)


    
    return {
        "f1_token": np.mean(f1_list),
        "precision": precision,
        "recall": recall,
        #"accuracy_token": accuracy,
        #"support_token": len(true_labels),
        #"f1_class_0": class_report[2][0],  # F1 for class 0
        #"f1_class_1": class_report[2][1],  # F1 for class 1
        #"tp": tp,
        #"fp": fp,
        #"fn": fn,
        #"tn": tn,
    }
    #{"exact_match": np.mean(em_list), "f1_token": np.mean(f1_list)}


In [18]:
def build_preprocess_token(tokenizer, max_length=384, stride=128):
    def preprocess_token(examples):
        questions = examples["question"]
        contexts = examples["context"]
        answers = examples["answer"] if examples["answerable"] else ""
        answer_starts = examples["answer_start"] if examples["answerable"] else -1

        # tokenize questions and context
        tokenized_examples = tokenizer(
            questions,
            contexts,
            truncation="only_second", # trunc only the context, not question, risk is that important part of context is trunced
            max_length=max_length,
            return_offsets_mapping=True,
            return_token_type_ids=True
        )

        sequence_ids = tokenized_examples.sequence_ids() # also here mark which tokens come from questions, context and special tokens
        offset_mapping = tokenized_examples["offset_mapping"]
        labels = np.full(len(tokenized_examples["input_ids"]), -100) # nitialize all positions with -100 so the loss ignores them
        context_token_indices = [i for i, s in enumerate(sequence_ids) if s == 1]

        if context_token_indices:
            context_start = context_token_indices[0]
            context_end = context_token_indices[-1]
            labels[context_start:context_end+1] = 0  # default O-label

            if examples["answerable"]:
                answer0 = answer_starts
                answer1 = answer_starts + len(answers)

                for i in range(context_start, context_end + 1):
                    start, end = offset_mapping[i]

                    if not (end <= answer0 or start >= answer1):
                        labels[i] = 1  # mark as 1 if answer span is overlapped

        del tokenized_examples["offset_mapping"] # offsets are no longer needed by the Trainer
        tokenized_examples["labels"] = labels.tolist()

        return tokenized_examples

    return preprocess_token

#### Prepocessing QA-Head (Start/End-Targets)
def build_preprocess(tokenizer, max_length=384):
    no_answer_index = 0
    def preprocess_qa(examples):
        questions = examples["question"]
        contexts = examples["context"]
        answers = examples["answer"] if examples["answerable"] else ""
        answer_starts = examples["answer_start"] if examples["answerable"] else -1

        tokenized_examples = tokenizer(
            questions,
            contexts,
            truncation="only_second", # trunc only the context, not question, risk is that important part of context is trunced
            max_length=max_length,
            return_offsets_mapping=True,
            return_token_type_ids=True
        )

        # unanswerable
        start = end = no_answer_index

        if examples["answerable"]:
            seq_ids = tokenized_examples.sequence_ids() # marks which tokens come from questions (0), context (1)
            offset_mapping = tokenized_examples["offset_mapping"]
            conext_token_indices = [i for i, s in enumerate(seq_ids) if s == 1]

            if len(conext_token_indices) > 0:
                context_start = conext_token_indices[0]
                context_end = conext_token_indices[-1]
                answer0 = answer_starts
                answer1 = answer_starts + len(answers)
                i = context_start

                # move i forward while the token ends before (or exactly at) the answer start
                while i <= context_end and (offset_mapping[i][0] <= answer0 and offset_mapping[i][1] <= answer0):
                    i += 1
                # step back if we overshot so that offsets[i-1] covers the answer start
                while i > context_start and offset_mapping[i-1][0] <= answer0 < offset_mapping[i-1][1]:
                    i -= 1

                # if i is within the context expand to cover the full answer span
                if context_start <= i <= context_end:
                    j = i
                    while j <= context_end and offset_mapping[j][0] < answer1:
                        j += 1
                    start = i
                    end = min(j-1, context_end) # last token that still overlaps with the answer

        tokenized_examples["start_positions"] = start
        tokenized_examples["end_positions"] = end
        tokenized_examples.pop("offset_mapping", None)

        return tokenized_examples
    return preprocess_qa


In [19]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

print(f"")

all_model_paths = ["wk40_bert-base-multilingual-cased/checkpoint-2376",
                   "wk40_distilbert-base-multilingual-cased/checkpoint-2376",
                   "wk40_xlm-roberta-base/checkpoint-2376",
]

for model_path in all_model_paths:
    print()
    print()
    print(model_path)
    print()

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)



    
    if "roberta" in model_path:    
        # IO-Token-Labeling
        prep = build_preprocess_token(tokenizer)
        train_prep = train_ds.map(prep, remove_columns=train_ds.column_names)
        test_prep  =  test_ds.map(prep, remove_columns=test_ds.column_names)
        
        model = AutoModelForTokenClassification.from_pretrained(model_path, num_labels=2)
        compute_metrics = compute_metrics_token
        data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
    
    else:
        prep = build_preprocess(tokenizer)
        train_prep = train_ds.map(prep, remove_columns=train_ds.column_names)
        test_prep  =  test_ds.map(prep, remove_columns=test_ds.column_names)
        
        model = AutoModelForQuestionAnswering.from_pretrained(model_path)
        compute_metrics = compute_metrics_qa
        data_collator = None
    
    
    
    training_args = TrainingArguments(
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_prep,
        eval_dataset=test_prep,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )
    
    
    
    ### Evaluate on the custom questions
    metrics = trainer.evaluate(eval_dataset=test_prep)
    #print(f"VAL [{lang}], {metrics}")
    tarr = [(metrics[metric]) for metric in ["eval_f1_token", "eval_precision", "eval_recall"]]
    for k,v in zip(tarr, ["eval_f1_token", "eval_precision", "eval_recall"]):
        print(v,k)





wk40_bert-base-multilingual-cased/checkpoint-2376



Map:   0%|          | 0/2778 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

  trainer = Trainer(


eval_f1_token 0.4962126068376068
eval_precision 1.0
eval_recall 1.0


wk40_distilbert-base-multilingual-cased/checkpoint-2376



Map:   0%|          | 0/2778 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

eval_f1_token 0.546984126984127
eval_precision 1.0
eval_recall 1.0


wk40_xlm-roberta-base/checkpoint-2376



Map:   0%|          | 0/2778 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

eval_f1_token 0.49696969696969695
eval_precision 0.44086021505376344
eval_recall 0.5694444444444444
