###WEEK 39

## First Model - Telegu Question + English Context -> Telegu Answer (mt5-small)

In [3]:
import os
from datasets import load_dataset

os.environ["WANDB_DISABLED"] = "true"

dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train = dataset["train"].to_pandas()
df_val = dataset["validation"].to_pandas()

df_te_train = df_train[df_train["lang"] == "te"]
df_te_val = df_val[df_val["lang"] == "te"]

df_te_train_with_ans = df_te_train[df_te_train["answer_inlang"].notna()]

df_te_val_with_ans = df_te_val[df_te_val["answer_inlang"].notna()]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/6.88M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/4.80k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15343 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3011 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4 [00:00<?, ? examples/s]

In [4]:
import pandas as pd
from datasets import Dataset
df_te_train = df_train[df_train["lang"] == "te"]
df_te_train = df_te_train[df_te_train["answer_inlang"].notna()]

df_te_val = df_val[df_val["lang"] == "te"]
df_te_val = df_te_val[df_te_val["answer_inlang"].notna()]

train_dataset = Dataset.from_pandas(df_te_train)
val_dataset = Dataset.from_pandas(df_te_val)

print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))


Train dataset size: 50
Validation dataset size: 100


In [31]:
from datasets import Dataset
from transformers import T5Tokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer





model_name =  "google/mt5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

def preprocess(examples):

    inputs = [
        f"telegu question: {q} english context: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]
    targets = examples["answer_inlang"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs



tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_val = val_dataset.map(preprocess, batched=True)


training_args = TrainingArguments(
    output_dir="./mt5_te_en_to_te",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
100,27.5834


TrainOutput(global_step=130, training_loss=24.403461397611178, metrics={'train_runtime': 123.877, 'train_samples_per_second': 4.036, 'train_steps_per_second': 1.049, 'total_flos': 264374845440000.0, 'train_loss': 24.403461397611178, 'epoch': 10.0})

In [8]:
import torch
import re


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    ("భారతదేశ రాజధాని ఏది?", "India's capital is New Delhi."),
    ("తాజ్ మహల్ ఎక్కడ ఉంది?", "The Taj Mahal is located in Agra, India."),
    ("భారతదేశ కరెన్సీ ఏమిటి?", "The currency of India is the Indian Rupee."),
]

for q, c in examples:
    input_text = (
        f"telugu question: {q} english context: {c}"
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")


Q: భారతదేశ రాజధాని ఏది?
A: ిత  పరి

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: rollerinklegroheja.abinsk  godu కోల్ ਅడ్ en త్రికణో

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: 国产提供తరం .



In [None]:
#!pip install evaluate sacrebleu rouge_score
import evaluate
import pandas as pd
import torch, re


bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

preds, refs = [], []


for i, row in df_te_val.iterrows():
    if not isinstance(row["answer_inlang"], str):
        continue

    question = row["question"]
    context = row["context"]

    input_text = f"telugu question: {question} english context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()

    preds.append(pred)
    refs.append([row["answer_inlang"]])


def evaluate_subset(df_subset, label):
    preds, refs = [], []
    for i, row in df_subset.iterrows():
        if not isinstance(row["answer_inlang"], str):
            continue

        question = row["question"]
        context = row["context"]

        input_text = f"telugu question: {question} english context: {context}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model.generate(**inputs, max_length=64, do_sample=False, num_beams=4)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\\d+>", "", pred).strip()

        preds.append(pred)
        refs.append([row["answer_inlang"]])
        return {
            "Type": label,
            "BLEU": round(bleu_result["score"], 2),
            "ROUGE-1": round(rouge_result["rouge1"], 4),
            "ROUGE-2": round(rouge_result["rouge2"], 4),
            "ROUGE-L": round(rouge_result["rougeL"], 4)
        }

bleu_result = bleu.compute(predictions=preds, references=refs)
rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])


data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}

df_results = pd.DataFrame(data)

print(df_results)


df_ans = df_te_val[df_te_val["answerable"] == True]
df_unans = df_te_val[df_te_val["answerable"] == False]


results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))


df_results = pd.DataFrame(results)
print(df_results)

  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu  0.24      0.0      0.0      0.0
           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable  0.24      0.0      0.0      0.0
1  Unanswerable  0.24      0.0      0.0      0.0


In [55]:

print(df_te_train["answerable"].value_counts() , "train")
print(df_te_val["answerable"].value_counts(), "val")

df_te_val[df_te_val["answerable"] == True][["question", "context", "answer_inlang"]]



answerable
False    45
True      5
Name: count, dtype: int64 train
answerable
False    93
True      7
Name: count, dtype: int64 val


Unnamed: 0,question,context,answer_inlang
2912,మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు?,Munna is a 2007 Telugu movie released on May 2...,హరీష్ జైరాజ్
2930,విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?,Gods do not come to take possessions. Observin...,త్రిశంకు
2932,సింగిరెడ్డి నారాయణరెడ్డి జ్ఞానపీఠ పురస్కారం ను...,"C.Na.Re. Singireddy Narayana Reddy (July 29, 1...",1988
3003,2011 జనగణన ప్రకారం గొట్టిప్రోలు గ్రామములో ఎన్న...,"Gottiprolu is a Village in Naidupet Mandal, Sr...",511
3004,2011 జనగణన ప్రకారం పెదలోవ గ్రామములో ఎన్ని ఇళ్ల...,Pedalova is a village belonging to Pedabayalu ...,23
3006,2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషు...,Reyyalagadda is a village belonging to Gangara...,37
3007,2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ...,Boothumillipadu is a village in Gannavaram man...,433


In [7]:
df_te_val[df_te_val["answerable"] == False][["question", "context", "answer_inlang"]].sample(5)


Unnamed: 0,question,context,answer_inlang
2954,బి.గోపాల్ దర్శకుడి తల్లిదండ్రుల పేర్లేమిటి?,Baggidi Gopal popularly known as B. Gopal is a...,"వెంకటేశ్వర్లు, తల్లి మహాలక్షమ్మ"
2936,2011 నాటికి పెద యాచవరం గ్రామ జనాభా ఎంత?,The following items are being produced. Pratti...,4610
2969,2011 నాటికి పెద యాచవరం గ్రామ జనాభా ఎంత?,Pochavaram is a village in Vatsavai Mandal of ...,4610
2986,కెనడా దేశ మొదటి ముఖ్యమంత్రి ఎవరు?,This article is a list of the prime ministers ...,సర్ జాన్ అలెగ్జాండర్ మెక్‌డోనాల్డ్
2911,మలేరియా వ్యాధి కి మందు కనిపెట్టిన శాస్త్రవేత్త...,Malaria is a disease spread by mosquitoes. Mal...,హన్స్ ఆండర్సాగ్


In [36]:
import re
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def show_model_behavior(example, label, model_number):
    question = example["question"]
    context = example["context"]
    gold = example["answer_inlang"]

    if model_number == 1:
        input_text = f"telugu question: {question} english context: {context}"
    elif model_number == 2:
        input_text = f"Telugu question: {question}"
    elif model_number == 3:
         input_text = f"Telugu question: {question}"
    else:
        raise ValueError("Invalid model number (choose 1, 2, or 3)")

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()


    print(f"\n===== {label.upper()} EXAMPLE =====")
    print(f"Question: {question}")
    print(f"Predicted: {pred}")
    print(f"Target: {gold}")
    print(f"Context snippet: {context[:150]}...")
    print("=" * 60)


ans_examples = df_te_val[df_te_val["answerable"] == True].sample(3, random_state=None)
unans_examples = df_te_val[df_te_val["answerable"] == False].sample(3, random_state=None)


for _, ex in ans_examples.iterrows():
    show_model_behavior(ex, "Answerable", 1)

for _, ex in unans_examples.iterrows():
    show_model_behavior(ex, "Unanswerable", 1)





===== ANSWERABLE EXAMPLE =====
Question: మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు?
Predicted: მერулан  агониర్ట్
Target: హరీష్ జైరాజ్
Context snippet: Munna is a 2007 Telugu movie released on May 2. Directed by Paidipalli Vamsi, the film stars Prabhas, Ileana, Prakash Raj, Kota Srinivasa Rao, Rahul D...

===== ANSWERABLE EXAMPLE =====
Question: విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?
Predicted: http檿
Target: త్రిశంకు
Context snippet: Gods do not come to take possessions. Observing this, Vishvamitra sends Trishanku to heaven in his body. Seeing that, Indra told Trishanku that you, w...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషుల సంఖ్య ఎంత?
Predicted: ッター니다sulullahಂತರ:0! الدین odendronక
Target: 37
Context snippet: Reyyalagadda is a village belonging to Gangaraju Madugula Mandal, Visakhapatnam District. It is 28 km from Gangaraju Madugu, the mandal centre. m. In ...

===== UNANSWERABLE EXAMPLE =====
Question: భరత్ అనే నేను చిత్ర నిర్మాత ఎవరు?


In [None]:
import random
for i in random.sample(range(len(df_te_val)), 5):
    q = df_te_val.iloc[i]["question"]
    c = df_te_val.iloc[i]["context"]
    target = df_te_val.iloc[i]["answer_inlang"]

    inputs = tokenizer(f"telugu question: {q} english context: {c}", return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = pred.replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
    print(f"\nQ: {q} \nContext {c} \nPred: {pred}\nTarget: {target}")



Q: మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు? 
Context the Bengali novel "Shaheb Bibi Golam" by Bimal Mitra. The film stars Meena Kumari, Guru Dutt, Rehman, Waheeda Rehman and Nazir Hussain. Its music is by Hemant Kumar and the lyrics are by Shakeel Badayuni. The film is also noted for its brilliant cinematography by V. K. Murthy and the famous songs "Na Jao Saiyaan Chhuda Ke Baiyan" and "Piya Aiso Jiya Mein" sung by Geeta Dutt. In "Sahib Bibi Aur Ghulam" Kumari played the character of Chhoti Bahu. For "Sahib Bibi Aur Ghulam", in order to support a drooping heavy look which is associated with immoderate consumption of liquor, 
Pred: thuuvudанъοθετžnostitiketter అbood趵 Advert సం 蚍δία
Target: హరీష్ జైరాజ్

Q: మలేరియా వ్యాధి కి మందు కనిపెట్టిన శాస్త్రవేత్త ఎవరు? 
Context He said. This, along with other discoveries later earned him the Nobel Prize in 1907. The name Plasmodium was given to this parasite discovered by Alphonse by two Italian scientists named "Ettore Marchiafava" and "Angelo Che

## Second Model - Telegu Question -> Telegu Answer (mt5-small)

In [18]:
from transformers import T5Tokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer



model_name = "google/mt5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_question_only(examples):
    inputs = [f"Telugu question: {q}" for q in examples["question"]]
    targets = examples["answer_inlang"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs



tokenized_train = train_dataset.map(preprocess_question_only, batched=True)
tokenized_val = val_dataset.map(preprocess_question_only, batched=True)



training_args = TrainingArguments(
    output_dir="./mt5_te_en_to_te",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=39, training_loss=43.73628117487981, metrics={'train_runtime': 88.3923, 'train_samples_per_second': 1.697, 'train_steps_per_second': 0.441, 'total_flos': 39656226816000.0, 'train_loss': 43.73628117487981, 'epoch': 3.0})

In [None]:
import torch
import re


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    "భారతదేశ రాజధాని ఏది?",
    "తాజ్ మహల్ ఎక్కడ ఉంది?",
    "భారతదేశ కరెన్సీ ఏమిటి?"
]

for q in examples:
    input_text = (
        f"Telugu question: {q} "
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")


Q: భారతదేశ రాజధాని ఏది?
A: -სამართ

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: ও

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: മുട  entzündunglaf  োহ岿টার্



In [None]:
preds, refs = [], []


for i, row in df_te_val.iterrows():
    if not isinstance(row["answer_inlang"], str):
        continue

    question = row["question"]


    input_text = f"Telugu question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()

    preds.append(pred)
    refs.append([row["answer_inlang"]])

bleu_result = bleu.compute(predictions=preds, references=refs)
rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])


data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}



df_results = pd.DataFrame(data)

print(df_results)




  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu  0.57      0.0      0.0      0.0


In [None]:
def evaluate_subset(df_subset, label):
    preds, refs = [], []
    for i, row in df_subset.iterrows():
        if not isinstance(row["answer_inlang"], str):
            continue

        question = row["question"]

        input_text = f"Telugu question: {question}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model.generate(**inputs, max_length=64, do_sample=False, num_beams=4)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\\d+>", "", pred).strip()

        preds.append(pred)
        refs.append([row["answer_inlang"]])

    bleu_result = bleu.compute(predictions=preds, references=refs)
    rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])

    return {
        "Type": label,
        "BLEU": round(bleu_result["score"], 2),
        "ROUGE-1": round(rouge_result["rouge1"], 4),
        "ROUGE-2": round(rouge_result["rouge2"], 4),
        "ROUGE-L": round(rouge_result["rougeL"], 4)
    }

df_ans = df_te_val[df_te_val["answerable"] == True]
df_unans = df_te_val[df_te_val["answerable"] == False]

results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))

df_results = pd.DataFrame(results)
print(df_results)

           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable  0.00      0.0      0.0      0.0
1  Unanswerable  0.17      0.0      0.0      0.0


In [37]:

ans_examples = df_te_val[df_te_val["answerable"] == True].sample(3, random_state=None)
unans_examples = df_te_val[df_te_val["answerable"] == False].sample(3, random_state=None)


for _, ex in ans_examples.iterrows():
    show_model_behavior(ex, "Answerable", 2)

for _, ex in unans_examples.iterrows():
    show_model_behavior(ex, "Unanswerable", 2)



===== ANSWERABLE EXAMPLE =====
Question: విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?
Predicted: ) నైజ్drej
Target: త్రిశంకు
Context snippet: Gods do not come to take possessions. Observing this, Vishvamitra sends Trishanku to heaven in his body. Seeing that, Indra told Trishanku that you, w...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం గొట్టిప్రోలు గ్రామములో ఎన్ని ఇళ్లులు ఉన్నాయి?
Predicted: リフト僇 uratorైర్iös
Target: 511
Context snippet: Gottiprolu is a Village in Naidupet Mandal, Sri Potti Sriramulu Nellore District, Andhra Pradesh State. It is 18 km from the mandal center Naidupet. m...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషుల సంఖ్య ఎంత?
Predicted: ward  σε: Komb లీtinente
Target: 37
Context snippet: Reyyalagadda is a village belonging to Gangaraju Madugula Mandal, Visakhapatnam District. It is 28 km from Gangaraju Madugu, the mandal centre. m. In ...

===== UNANSWERABLE EXAMPLE =====
Question: భరత్ అనే నేను చిత్ర నిర్

In [None]:
import random
for i in random.sample(range(len(df_te_val)), 5):
    q = df_te_val.iloc[i]["question"]

    target = df_te_val.iloc[i]["answer_inlang"]

    inputs = tokenizer(f"telugu question: {q}", return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = pred.replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
    print(f"\nQ: {q} \nContext {c} \nPred: {pred}\nTarget: {target}")



Q: ఏ ఉష్ణోగ్రత వద్ద నీరు ఆవిరిగా మారుతుంది? 
Context Pierre Fauquard was one of the first to disprove the idea that worms cause tooth decay. Sugar is said to be harmful to teeth and gums. Another remarkable increase in the prevalence of dental caries occurred in the 1850s. The reason for that is thought to be more dietary changes. Earlier cervical caries were recognized as the most frequent caries. But the exponential increase in the availability of sugar cane, refined flour, bread and sweetened teas has led to an increase in the number of cavities and fissure caries. In the 1890s W.D. Miller conducted a series of studies on this. They enabled him to propose an analysis of dental caries that influenced contemporary theories. He noted that microbes colonize the mouth and produce acids that damage tooth structures in the presence of fermentable carbohydrates. This explanation 
Pred: ందుకు .?>
Target: (100 °సెం.)

Q: ఈస్ట్ ఇండియా కంపెనీ భారతదేశంలోకి ఎప్పుడు వచ్చింది? 
Context Pierre Fauq

## Third Model Telegu Question -> English Answer (flan-t5-small)

In [49]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

model_name = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_te_to_en(examples):
    inputs = [f"Telegu question: {q}" for q in examples["question"]]
    targets = examples["answer"]  # English answers
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_train = train_dataset.map(preprocess_te_to_en, batched=True)
tokenized_val = val_dataset.map(preprocess_te_to_en, batched=True)

training_args = TrainingArguments(
    output_dir="./mt5_te_en_to_te",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=39, training_loss=14.264578012319712, metrics={'train_runtime': 27.1667, 'train_samples_per_second': 5.521, 'train_steps_per_second': 1.436, 'total_flos': 13941787852800.0, 'train_loss': 14.264578012319712, 'epoch': 3.0})

In [51]:
import torch
import re


device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    "భారతదేశ రాజధాని ఏది?",
    "తాజ్ మహల్ ఎక్కడ ఉంది?",
    "భారతదేశ కరెన్సీ ఏమిటి?"
]

for q in examples:
    input_text = (
        f"Telugu question: {q} "
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")


Q: భారతదేశ రాజధాని ఏది?
A: Transmission

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: stray

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: capacitor



In [None]:

preds, refs = [], []


for i, row in df_te_val.iterrows():
    if not isinstance(row["answer"], str):
        continue

    question = row["question"]
    input_text = f"Telegu question: {question}"

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()

    preds.append(pred)
    refs.append([row["answer"]])



data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}

df_results = pd.DataFrame(data)
print(df_results)






  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu  0.79   0.0029      0.0   0.0029


In [None]:


df_ans = df_te_val[df_te_val["answerable"] == True]
df_unans = df_te_val[df_te_val["answerable"] == False]


results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))

df_results = pd.DataFrame(results)
print(df_results)


           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable  0.00      0.0      0.0      0.0
1  Unanswerable  0.18      0.0      0.0      0.0


In [52]:





ans_examples = df_te_val[df_te_val["answerable"] == True].sample(5, random_state=None)
unans_examples = df_te_val[df_te_val["answerable"] == False].sample(3, random_state=None)

for _, ex in ans_examples.iterrows():
    show_model_behavior(ex, "Answerable", 3)

for _, ex in unans_examples.iterrows():
    show_model_behavior(ex, "Unanswerable", 3)



===== ANSWERABLE EXAMPLE =====
Question: మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు?
Predicted: 
Target: హరీష్ జైరాజ్
Context snippet: Munna is a 2007 Telugu movie released on May 2. Directed by Paidipalli Vamsi, the film stars Prabhas, Ileana, Prakash Raj, Kota Srinivasa Rao, Rahul D...

===== ANSWERABLE EXAMPLE =====
Question: విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?
Predicted: 
Target: త్రిశంకు
Context snippet: Gods do not come to take possessions. Observing this, Vishvamitra sends Trishanku to heaven in his body. Seeing that, Indra told Trishanku that you, w...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషుల సంఖ్య ఎంత?
Predicted: s, etc
Target: 37
Context snippet: Reyyalagadda is a village belonging to Gangaraju Madugula Mandal, Visakhapatnam District. It is 28 km from Gangaraju Madugu, the mandal centre. m. In ...

===== ANSWERABLE EXAMPLE =====
Question: సింగిరెడ్డి నారాయణరెడ్డి జ్ఞానపీఠ పురస్కారం ను ఎప్పుడు అందుకున్నాడు ?
Predicted: 
Target

In [54]:
import random
for i in random.sample(range(len(df_te_val)), 5):
    q = df_te_val.iloc[i]["question"]

    target = df_te_val.iloc[i]["answer"]

    inputs = tokenizer(f"Telugu question: {q}", return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = pred.replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
    print(f"\nQ: {q} \nPred: {pred}\nTarget: {target}")



Q: ఏ ఉష్ణోగ్రత వద్ద నీరు ఆవిరిగా మారుతుంది? 
Pred: 
Target: (100 °C)

Q: ఈస్ట్ ఇండియా కంపెనీ భారతదేశంలోకి ఎప్పుడు వచ్చింది? 
Pred: s
Target: 1608

Q: యానాం యొక్క విస్తీర్ణం ఎంత ? 
Pred: Indians
Target: 30 sq. km

Q: 2011 జనగణన ప్రకారం మహావా గ్రామంలో ఎంతమంది స్త్రీలు ఉన్నారు? 
Pred: sliding
Target: 548 g

Q: మనిషి చనిపోయాక ఏ అవయవం ఎక్కువ సమయం పనిచేస్తుంది? 
Pred: movement
Target: spine
