### Week 39


In [None]:
from datasets import load_dataset
dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train = dataset["train"].to_pandas()
df_val = dataset["validation"].to_pandas()

df_train_te = df_train[df_train["lang"].str.lower() == "te"]
df_val_te   = df_val[df_val["lang"].str.lower() == "te"]

#print(df_train_te.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Telugu question and English -> Telugu answer

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
import re, html


def get_inlang(v):
    if v is None: return ""
    if isinstance(v, dict):
        t = v.get("text", "")
        if isinstance(t, list): return t[0] if t else ""
        return t if isinstance(t, str) else ""
    if isinstance(v, list): return v[0] if v else ""
    if isinstance(v, str):  return v
    return ""

def clean_text(s: str) -> str:
    s = html.unescape(str(s))
    s = re.sub(r"<br\s*/?>", " ", s, flags=re.I)
    s = re.sub(r"<[^>]+>", " ", s)             # remove any HTML tags
    s = re.sub(r"\s+", " ", s).strip()
    return s
def prep(df):
    out = df.copy()
    out["question_norm"] = df["question"].astype(str).str.strip().apply(clean_text)
    out["context_norm"]  = df["context"].astype(str).str.strip().apply(clean_text)

    # your existing get_inlang(...) stays the same; produces 'answer_te'
    out["answer_te"]     = df["answer_inlang"].apply(get_inlang).astype(str).str.strip()

    mask = (out["question_norm"] != "") & (out["context_norm"] != "") & (out["answer_te"] != "")
    out = out[mask]

    # 🔁 REPLACE your old prompt with this VERY PLAIN one (prevents copying prompt)
    out["input_text"]  = "Question: " + out["question_norm"] + " Context: " + out["context_norm"]
    out["target_text"] = out["answer_te"]

    return out[["input_text","target_text"]]
train_proc = prep(df_train_te)
val_proc   = prep(df_val_te)

print(f"Train usable: {len(train_proc)}  |  Val usable: {len(val_proc)}")
print(train_proc.head(2))

data_te = DatasetDict({
    "train": Dataset.from_pandas(train_proc, preserve_index=False),
    "validation": Dataset.from_pandas(val_proc, preserve_index=False),
})


Train usable: 50  |  Val usable: 100
                                              input_text target_text
15276  Question: 1990 నాటికి ఆఫ్రికాలో అతిపెద్ద జనాభా...    నైజీరియా
15277  Question: 2010 నాటికీ వ్యవసాయ రంగంలో చైనా దేశం...       ప్రధమ


In [None]:
# deps
!pip -q install "transformers>=4.42" "accelerate>=0.33" evaluate sacrebleu sentencepiece peft

import numpy as np, torch
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
)
import evaluate
from peft import LoraConfig, get_peft_model, TaskType

assert len(data_te["train"])>0 and len(data_te["validation"])>0, "No usable data after filtering."

# ---------------- config ----------------
MODEL = "google/byt5-small"
MAX_SRC = 768     # byte-level → longer inputs OK
MAX_TGT = 96
LR      = 1e-4
EPOCHS  = 12
BATCH   = 4
USE_LORA = True   # set False to fine-tune full model

# -------------- tokenizer ---------------
tok = AutoTokenizer.from_pretrained(MODEL)

# Build a small ban list so generations don't copy the prompt words
ban_strings = ["Question:", "Context:", "question:", "context:", "question", "context", "qa", "QA"]
bad_words_ids = []
for s in ban_strings:
    ids = tok.encode(s, add_special_tokens=False)
    if ids:  # only keep non-empty encodings
        bad_words_ids.append(ids)


def tokenize_batch(batch):
    # robust labels via text_target (works on recent Transformers)
    return tok(batch["input_text"], text_target=batch["target_text"],
               truncation=True, max_length=MAX_SRC)

tokd = data_te.map(tokenize_batch, batched=True, remove_columns=["input_text","target_text"])

# --------------- model ------------------
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

if USE_LORA:
    peft_cfg = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=16, lora_alpha=32, lora_dropout=0.1,
        target_modules=["q","k","v","o"]  # T5/ByT5 attention proj names
    )
    model = get_peft_model(model, peft_cfg)
    model.print_trainable_parameters()

collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model, label_pad_token_id=-100)

# -------------- metrics -----------------
bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

def compute_metrics(eval_pred):
    pred_ids, label_ids = eval_pred
    # decode
    label_ids = np.where(label_ids == -100, tok.pad_token_id, label_ids)
    preds = tok.batch_decode(pred_ids, skip_special_tokens=True)
    refs  = tok.batch_decode(label_ids, skip_special_tokens=True)
    # sacrebleu expects list[str], list[list[str]]
    return {
        "bleu": bleu.compute(predictions=preds, references=[[r] for r in refs])["score"],
        "chrf": chrf.compute(predictions=preds, references=[[r] for r in refs])["score"],
    }

# ---------- training args (stable) ------
args = Seq2SeqTrainingArguments(
    output_dir="byt5_te_q_en_ctx_te_ans",
    learning_rate=LR,
    label_smoothing_factor=0.1,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    num_train_epochs=EPOCHS,
    fp16=False,                         # keep off for stability on tiny data
    logging_steps=5,
    save_strategy="no",
    predict_with_generate=True,
    generation_max_length=MAX_TGT,
    report_to="none",
    remove_unused_columns=False,        # IMPORTANT: keep 'labels'
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokd["train"],
    eval_dataset=tokd["validation"],
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# stronger decoding to avoid empty strings at eval
trainer.args.generation_num_beams = 5

# ---- sanity: verify a finite batch loss before training
from torch.utils.data import DataLoader
dl = DataLoader(tokd["train"], batch_size=2, shuffle=True, collate_fn=collator)
batch = next(iter(dl))
batch = {k: v.to(model.device) for k, v in batch.items()}
with torch.no_grad():
    out = model(**batch)
print("Manual batch loss:", float(out.loss))



trainer.train()


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 2,375,680 || all params: 302,013,440 || trainable%: 0.7866


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Seq2SeqTrainer(


Manual batch loss: 1.395450234413147


Step,Training Loss
5,5.2788
10,4.6123
15,4.5656
20,4.6344
25,4.6594
30,4.9584
35,4.4942
40,4.6622
45,4.404
50,4.9931


TrainOutput(global_step=156, training_loss=4.291945101358952, metrics={'train_runtime': 177.4106, 'train_samples_per_second': 3.382, 'train_steps_per_second': 0.879, 'total_flos': 833442899558400.0, 'train_loss': 4.291945101358952, 'epoch': 12.0})

In [None]:

gen_kwargs = dict(
    num_beams=8,
    max_new_tokens=MAX_TGT,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
)

# only add bad_words_ids if we actually have some
if bad_words_ids:
    gen_kwargs["bad_words_ids"] = bad_words_ids

metrics = trainer.evaluate(**gen_kwargs)
print(metrics)

out = trainer.predict(tokd["validation"], **gen_kwargs)
import numpy as np
preds = tok.batch_decode(out.predictions, skip_special_tokens=True)
labels = np.where(out.label_ids == -100, tok.pad_token_id, out.label_ids)
refs  = tok.batch_decode(labels, skip_special_tokens=True)

for i in range(min(10, len(preds))):
    print(f"[{i}] PRED: {preds[i]!r}\n    GOLD: {refs[i]!r}\n")


# Inference helper (Telugu Q + English ctx → Telugu A)
def answer_te_from_en_ctx(question_te: str, context_en: str, max_new_tokens=96) -> str:
    prompt = f"Question: {question_te} Context: {context_en}"
    enc = tok([prompt], return_tensors="pt", truncation=True, max_length=MAX_SRC).to(model.device)
    kwargs = dict(num_beams=8, max_new_tokens=max_new_tokens, no_repeat_ngram_size=3, repetition_penalty=1.2)
    if bad_words_ids:
        kwargs["bad_words_ids"] = bad_words_ids
    gen = model.generate(**enc, **kwargs)
    return tok.decode(gen[0], skip_special_tokens=True)


# Example:
# print(answer_te_from_en_ctx("1990 నాటికి ఆఫ్రికాలో అతిపెద్ద జనాభా కలిగిన దేశం ఏది?",
#                             "Africa ... Nigeria is its largest by population ..."))


{'eval_loss': 4.411931991577148, 'eval_bleu': 0.0451490098760212, 'eval_chrf': 2.752218687085018, 'eval_runtime': 102.1601, 'eval_samples_per_second': 0.979, 'eval_steps_per_second': 0.245, 'epoch': 12.0}
[0] PRED: 'మలేరియా వ్௯squitoes. Malaria is a disease spread when the parasite enters'
    GOLD: 'హన్స్ ఆండర్సాగ్'

[1] PRED: 'నా చిత్రీకుడైపోయలేదంటూ. Munna is a 2007 Telugu movie '
    GOLD: 'హరీష్ జైరాజ్'

[2] PRED: 'లోకి వస్తుందా? Answer: In 1858, the British government began directly g'
    GOLD: '1608'

[3] PRED: 'లో ప్రవేశించుకొనడా? Answer: Ugadi is celebrated as a sign of '
    GOLD: 'మార్చి లేదా ఏప్రిల్'

[4] PRED: ' (landlocked) Maritime claims: 0 km "border countries:" Cameroon 1,420 m Natural resources\nAnsw'
    GOLD: 'కొలంబియా మాదిరిగా అదే పరిమాణం'

[5] PRED: 'యార్కు జనసందేహమీడిపోలైటైఫొబృచౌత'
    GOLD: '28,491'

[6] PRED: ' Pakistan is one of the leading Hindu spiritual centers in 1947. Answer: పాకిస్త'
    GOLD: '1947'

[7] PRED: ' and a daughter Gopalareddy is also work

### Telegu question -> Telegu answer



In [None]:
# Q-only prep (reuses get_inlang / clean_text from before)
def prep_q_only(df):
    out = df.copy()
    out["question_norm"] = df["question"].astype(str).str.strip().apply(clean_text)
    out["answer_te"]     = df["answer_inlang"].apply(get_inlang).astype(str).str.strip()

    # keep examples that have a question + in-language answer
    mask = (out["question_norm"] != "") & (out["answer_te"] != "")
    out = out[mask]

    # short, neutral prompt (no context)
    out["input_text"]  = "తెలుగులో సంక్షిప్త సమాధానం ఇవ్వండి. Question: " + out["question_norm"]
    out["target_text"] = out["answer_te"]
    return out[["input_text","target_text"]]

train_q = prep_q_only(df_train_te)
val_q   = prep_q_only(df_val_te)

from datasets import Dataset, DatasetDict
data_te_qonly = DatasetDict({
    "train": Dataset.from_pandas(train_q, preserve_index=False),
    "validation": Dataset.from_pandas(val_q, preserve_index=False),
})
print(len(train_q), len(val_q))


50 100


In [None]:
# tokenize
tokd_q = data_te_qonly.map(
    lambda b: tok(b["input_text"], text_target=b["target_text"], truncation=True, max_length=MAX_SRC),
    batched=True, remove_columns=["input_text","target_text"]
)

# train (you can reuse the same model/args/collator)
trainer.train_dataset = tokd_q["train"]
trainer.eval_dataset  = tokd_q["validation"]

# optional: decoding constraints are not really needed now (no context to copy),
# but you can keep beams if you like:
trainer.args.generation_num_beams = 5

trainer.train()


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Step,Training Loss
5,4.4232
10,3.4552
15,3.5326
20,3.4424
25,3.4808
30,3.4436
35,3.3385
40,3.2654
45,3.1452
50,3.4993


TrainOutput(global_step=156, training_loss=3.048918965535286, metrics={'train_runtime': 62.4337, 'train_samples_per_second': 9.61, 'train_steps_per_second': 2.499, 'total_flos': 288919637099520.0, 'train_loss': 3.048918965535286, 'epoch': 12.0})

In [None]:
metrics = trainer.evaluate()
print(metrics)

out = trainer.predict(tokd_q["validation"])
import numpy as np
preds = tok.batch_decode(out.predictions, skip_special_tokens=True)
labels = np.where(out.label_ids == -100, tok.pad_token_id, out.label_ids)
refs  = tok.batch_decode(labels, skip_special_tokens=True)
for i in range(min(10, len(preds))):
    print(f"[{i}] PRED: {preds[i]!r}\n    GOLD: {refs[i]!r}\n")

def answer_te(question_te: str, max_new_tokens=32) -> str:
    prompt = f"తెలుగులో సంక్షిప్త సమాధానం ఇవ్వండి. Question: {question_te}"
    enc = tok([prompt], return_tensors="pt", truncation=True, max_length=MAX_SRC).to(model.device)
    gen = model.generate(**enc, num_beams=5, max_new_tokens=max_new_tokens)
    return tok.decode(gen[0], skip_special_tokens=True)


{'eval_loss': 2.9677209854125977, 'eval_bleu': 0.16900453727161294, 'eval_chrf': 7.043303917284298, 'eval_runtime': 41.6565, 'eval_samples_per_second': 2.401, 'eval_steps_per_second': 0.6, 'epoch': 12.0}
[0] PRED: 'రియా వ్యాధికి మందుకి మందు కనిపెట్ట'
    GOLD: 'హన్స్ ఆండర్సాగ్'

[1] PRED: 'ర్యాప్యండి. మున్నా చిత్రానికి సంక్'
    GOLD: 'హరీష్ జైరాజ్'

[2] PRED: 'లోకి వచ్చింది. కంపెనీలోకి వచ్చింది'
    GOLD: '1608'

[3] PRED: 'ర్లీష్లో సంక్షిప్త సమాధానం ఇవ్వండ'
    GOLD: 'మార్చి లేదా ఏప్రిల్'

[4] PRED: 'ర్యాక్రిక్ కార్మిక్ కార్మిక్ కార్'
    GOLD: 'కొలంబియా మాదిరిగా అదే పరిమాణం'

[5] PRED: 'ర్యాక్స్ న్యూయార్క్ న్యూయార్క్ లో'
    GOLD: '28,491'

[6] PRED: 'రిక్కండి. పాకిస్తాన్ కు సంక్షిప్తం'
    GOLD: '1947'

[7] PRED: 'ల్లిదండ్రులకు ప్రాయంల్లాండ్రుల్ల'
    GOLD: 'వెంకటేశ్వర్లు, తల్లి మహాలక్షమ్మ'

[8] PRED: 'రిల్డిక్ లోకంలోకి ప్రాకంలోకి ప్రా'
    GOLD: '87,000'

[9] PRED: 'ర్యాక్ అన్నారి. నేను చిత్రంగా చిత్'
    GOLD: 'డివివి దానయ్య'



### Telegu Question -> English answer

In [None]:
# Q-only, target = EN answer
import re, html
from datasets import Dataset, DatasetDict

def clean_text(s: str) -> str:
    s = html.unescape(str(s))
    s = re.sub(r"<br\s*/?>", " ", s, flags=re.I)
    s = re.sub(r"<[^>]+>", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def get_answer_en(v):
    if v is None: return ""
    if isinstance(v, dict):
        t = v.get("text", "")
        if isinstance(t, list): return t[0] if t else ""
        return t if isinstance(t, str) else ""
    if isinstance(v, list): return v[0] if v else ""
    if isinstance(v, str):  return v
    return ""

def prep_q_to_en(df):
    out = df.copy()
    out["question_norm"] = df["question"].astype(str).str.strip().apply(clean_text)
    out["answer_en"]     = df["answer"].apply(get_answer_en).astype(str).str.strip()
    mask = (out["question_norm"] != "") & (out["answer_en"] != "")
    out = out[mask]
    # minimal prompt to avoid copying
    out["input_text"]  = "Answer briefly in English. Question (Telugu): " + out["question_norm"]
    out["target_text"] = out["answer_en"]
    return out[["input_text","target_text"]]

train_q2en = prep_q_to_en(df_train_te)
val_q2en   = prep_q_to_en(df_val_te)

data_te_q2en = DatasetDict({
    "train": Dataset.from_pandas(train_q2en, preserve_index=False),
    "validation": Dataset.from_pandas(val_q2en, preserve_index=False),
})
print(len(train_q2en), len(val_q2en))
train_q2en.head(2)


1355 384


Unnamed: 0,input_text,target_text
13771,Answer briefly in English. Question (Telugu): ...,London
13772,Answer briefly in English. Question (Telugu): ...,Jawaharlal Nehru


In [None]:
# Tokenize (reuse tok/model/trainer/collator from Case 2)
tokd_q2en = data_te_q2en.map(
    lambda b: tok(b["input_text"], text_target=b["target_text"], truncation=True, max_length=MAX_SRC),
    batched=True, remove_columns=["input_text","target_text"]
)

trainer.train_dataset = tokd_q2en["train"]
trainer.eval_dataset  = tokd_q2en["validation"]

# Short answers; safe decoding defaults
trainer.args.generation_num_beams = 8
trainer.args.generation_max_length = 16   # English names/numbers are short

trainer.train()


Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Step,Training Loss
5,4.423
10,4.925
15,4.6479
20,4.5818
25,4.568
30,4.4402
35,4.204
40,3.9552
45,4.1759
50,4.0798


TrainOutput(global_step=4068, training_loss=3.0613298545075964, metrics={'train_runtime': 1197.3774, 'train_samples_per_second': 13.58, 'train_steps_per_second': 3.397, 'total_flos': 5787703861883904.0, 'train_loss': 3.0613298545075964, 'epoch': 12.0})

In [None]:
# Evaluate
metrics = trainer.evaluate()
print(metrics)

# Preview
out = trainer.predict(tokd_q2en["validation"])
import numpy as np
preds = tok.batch_decode(out.predictions, skip_special_tokens=True)
labels = np.where(out.label_ids == -100, tok.pad_token_id, out.label_ids)
refs  = tok.batch_decode(labels, skip_special_tokens=True)
for i in range(min(10, len(preds))):
    print(f"[{i}] PRED: {preds[i]!r}\n    GOLD: {refs[i]!r}\n")

# Inference helper (Telugu Q -> English A)
def answer_en_from_te(question_te: str, max_new_tokens=16) -> str:
    prompt = f"Answer briefly in English. Question (Telugu): {question_te}"
    enc = tok([prompt], return_tensors="pt", truncation=True, max_length=MAX_SRC).to(model.device)
    gen = model.generate(**enc, num_beams=8, max_new_tokens=max_new_tokens)
    return tok.decode(gen[0], skip_special_tokens=True)

# Example:
# print(answer_en_from_te("మలేరియా వ్యాధి కి మందు కనిపెట్టిన శాస్త్రవేత్త ఎవరు?"))


{'eval_loss': 2.535140037536621, 'eval_bleu': 0.4121637260959798, 'eval_chrf': 4.431975086531433, 'eval_runtime': 47.5269, 'eval_samples_per_second': 8.08, 'eval_steps_per_second': 2.02, 'epoch': 12.0}
[0] PRED: 'Karnataka'
    GOLD: 'Portland'

[1] PRED: 'India'
    GOLD: 'Indian subcontinent'

[2] PRED: 'India'
    GOLD: 'England'

[3] PRED: '1999'
    GOLD: '1914'

[4] PRED: '1999'
    GOLD: '28 July 1914'

[5] PRED: '2005'
    GOLD: 'India'

[6] PRED: 'India and India'
    GOLD: '122'

[7] PRED: 'the most of the'
    GOLD: 'approximately 5 liters'

[8] PRED: 'National Govern'
    GOLD: 'JPMorgan Chase Tower'

[9] PRED: 'India and India'
    GOLD: '18'

