###WEEK 39

In [None]:
import os
from datasets import load_dataset

os.environ["WANDB_DISABLED"] = "true"

dataset = load_dataset("coastalcph/tydi_xor_rc")
df_train = dataset["train"].to_pandas()
df_val = dataset["validation"].to_pandas()

df_te_train = df_train[df_train["lang"] == "te"]
df_te_val = df_val[df_val["lang"] == "te"]

df_te_train_with_ans = df_te_train[df_te_train["answer_inlang"].notna()]

df_te_val_with_ans = df_te_val[df_te_val["answer_inlang"].notna()]

README.md: 0.00B [00:00, ?B/s]

train.parquet:   0%|          | 0.00/6.88M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/4.80k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15343 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3011 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4 [00:00<?, ? examples/s]

#Statistics about Telegu subset with 'answer_lang'

In [None]:
import pandas as pd
from datasets import Dataset

# Keeping only rows where the language column ('lang') is Telugu ("te")
# and where the translated answer ('answer_inlang') is not missing

df_te_train = df_train[df_train["lang"] == "te"]
df_te_train = df_te_train[df_te_train["answer_inlang"].notna()]

df_te_val = df_val[df_val["lang"] == "te"]
df_te_val = df_te_val[df_te_val["answer_inlang"].notna()]

train_dataset = Dataset.from_pandas(df_te_train)
val_dataset = Dataset.from_pandas(df_te_val)


train_pairs = df_te_train[["question", "answer_inlang"]].drop_duplicates()
val_pairs = df_te_val[["question", "answer_inlang"]].drop_duplicates()

#Counting duplicates
train_duplicates = len(df_te_train) - len(train_pairs)
val_duplicates = len(df_te_val) - len(val_pairs)

#Overlaps between train and val
merged = pd.merge(train_pairs, val_pairs, on=["question", "answer_inlang"], how="inner")
overlap_count = len(merged)

#Unique-only validation pairs
val_unique_only = len(val_pairs) - overlap_count

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

print("\n--- TRAIN ---")
print(f"Duplicates: {train_duplicates}")
print(f"Unique pairs: {len(train_pairs)}")

print("\n--- VALIDATION ---")
print(f"Duplicates: {val_duplicates}")
print(f"Unique pairs: {len(val_pairs)}")
print(f"Same with train: {overlap_count}")
print(f"Unique (not in train): {val_unique_only}")



Train dataset size: 50
Validation dataset size: 100

--- TRAIN ---
Duplicates: 16
Unique pairs: 34

--- VALIDATION ---
Duplicates: 52
Unique pairs: 48
Same with train: 34
Unique (not in train): 14


## First Model - Telegu Question + English Context -> Telegu Answer (mT5-small-finetuned-tydiqa-for-xqa)

In [None]:
from datasets import Dataset
from transformers import T5Tokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer

#Fine-tuning mt5-small-l-finetuned-tydiqa-for-xqa to generate Telugu answers using tTelugu question and english context as input

# Loading pretrained model and tokenizer

model_name =  "mrm8488/mT5-small-finetuned-tydiqa-for-xqa"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

def preprocess(examples):

    inputs = [
        f"telegu question: {q} english context: {c}"
        for q, c in zip(examples["question"], examples["context"]) # Combing Telugu question and English context into a single input sequence
    ]
    targets = examples["answer_inlang"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

# Tokenize datasets

tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_val = val_dataset.map(preprocess, batched=True)


training_args = TrainingArguments(
    output_dir="./mt5_te_en_to_te",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=40,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()


You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.
Some weights of MT5ForConditionalGeneration were not initialized from the model checkpoint at mrm8488/mT5-small-finetuned-tydiqa-for-xqa and are newly initialized: ['decoder.block.0.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.2.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.2.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.4.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.4.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.5.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.5.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.6.layer.2.DenseReluDense.w

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
100,21.9207
200,0.3588
300,0.2377
400,0.1682
500,0.1263


TrainOutput(global_step=520, training_loss=4.391806266858027, metrics={'train_runtime': 561.9531, 'train_samples_per_second': 3.559, 'train_steps_per_second': 0.925, 'total_flos': 1057499381760000.0, 'train_loss': 4.391806266858027, 'epoch': 40.0})

In [5]:
#!!!!! Only to be run to save the model to be used in week41+ !!!!!
trainer.save_model("./mt5_te_en_to_te_final")
tokenizer.save_pretrained("./mt5_te_en_to_te_final")

('./mt5_te_en_to_te_final/tokenizer_config.json',
 './mt5_te_en_to_te_final/special_tokens_map.json',
 './mt5_te_en_to_te_final/spiece.model',
 './mt5_te_en_to_te_final/added_tokens.json')

In [None]:
import torch
import re
##Trying examples

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    ("భారతదేశ రాజధాని ఏది?", "India's capital is New Delhi."),
    ("తాజ్ మహల్ ఎక్కడ ఉంది?", "The Taj Mahal is located in Agra, India."),
    ("భారతదేశ కరెన్సీ ఏమిటి?", "The currency of India is the Indian Rupee."),
]

for q, c in examples:
    input_text = (
        f"telugu question: {q} english context: {c}"
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")


Q: భారతదేశ రాజధాని ఏది?
A: భారత భారత కన భారత భారత జన భారతవాత భారత ప్రభుత్వ భారత భారత కేంద్ర రాష్ట్ర భారత కేంద్రదేశాల రాష్ట్రలె భారత భారత విస్తదు రాష్ట్ర ప్ర భారత

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: బాంగ్జాంజ్మీ్మీ తెలుగుంగ్టెక్اتصالాబ్ ucha్మీసిస్]).国語 తెలుగుీస్్మీডিয়া తెలుగుదేశాల్మీ తెలుగుంబబ్ల తెలుగుకెట్దేశాల మహా భారతసాగ్రదేశాలసరిదేశాలగ్ГЭయూదేశాలదేశాలదేశాల ఉత్త...).దేశమీాబాద్

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: భారత ప్ర



In [None]:
#!pip install evaluate sacrebleu rouge_score
import evaluate
import pandas as pd
import torch, re

##metrics

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

preds, refs = [], []


for i, row in df_te_val.iterrows():
    if not isinstance(row["answer_inlang"], str):
        continue

    question = row["question"]
    context = row["context"]

    input_text = f"telugu question: {question} english context: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip() ##removing unwanted placeholder tokens (<extra_id_#>) that mT5 sometimes outputs

    preds.append(pred)
    refs.append([row["answer_inlang"]])


def evaluate_subset(df_subset, label):
    preds, refs = [], []
    for i, row in df_subset.iterrows():
        if not isinstance(row["answer_inlang"], str):
            continue

        question = row["question"]
        context = row["context"]

        input_text = f"telugu question: {question} english context: {context}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model.generate(**inputs, max_length=64, do_sample=False, num_beams=4)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\\d+>", "", pred).strip()

        preds.append(pred)
        refs.append([row["answer_inlang"]])
        return {
            "Type": label,
            "BLEU": round(bleu_result["score"], 2),
            "ROUGE-1": round(rouge_result["rouge1"], 4),
            "ROUGE-2": round(rouge_result["rouge2"], 4),
            "ROUGE-L": round(rouge_result["rougeL"], 4)
        }



bleu_result = bleu.compute(predictions=preds, references=refs)
rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])


data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}

df_results = pd.DataFrame(data)

print(df_results)

#Splitting answarble vs unanswerble

df_ans = df_te_val[df_te_val["answerable"] == True]
df_unans = df_te_val[df_te_val["answerable"] == False]


results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))


df_results = pd.DataFrame(results)
print(df_results)

  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu  0.27      0.0      0.0      0.0
           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable  0.27      0.0      0.0      0.0
1  Unanswerable  0.27      0.0      0.0      0.0


In [None]:
#Dataset balance (how many answerable/unanswerable examples)

print(df_te_train["answerable"].value_counts() , "train")
print(df_te_val["answerable"].value_counts(), "val")

df_te_val[df_te_val["answerable"] == True][["question", "context", "answer_inlang"]]

answerable
False    45
True      5
Name: count, dtype: int64 train
answerable
False    93
True      7
Name: count, dtype: int64 val


Unnamed: 0,question,context,answer_inlang
2912,మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు?,Munna is a 2007 Telugu movie released on May 2...,హరీష్ జైరాజ్
2930,విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?,Gods do not come to take possessions. Observin...,త్రిశంకు
2932,సింగిరెడ్డి నారాయణరెడ్డి జ్ఞానపీఠ పురస్కారం ను...,"C.Na.Re. Singireddy Narayana Reddy (July 29, 1...",1988
3003,2011 జనగణన ప్రకారం గొట్టిప్రోలు గ్రామములో ఎన్న...,"Gottiprolu is a Village in Naidupet Mandal, Sr...",511
3004,2011 జనగణన ప్రకారం పెదలోవ గ్రామములో ఎన్ని ఇళ్ల...,Pedalova is a village belonging to Pedabayalu ...,23
3006,2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషు...,Reyyalagadda is a village belonging to Gangara...,37
3007,2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ...,Boothumillipadu is a village in Gannavaram man...,433


In [None]:
df_te_val[df_te_val["answerable"] == False][["question", "context", "answer_inlang"]].sample(5)


Unnamed: 0,question,context,answer_inlang
2954,బి.గోపాల్ దర్శకుడి తల్లిదండ్రుల పేర్లేమిటి?,Baggidi Gopal popularly known as B. Gopal is a...,"వెంకటేశ్వర్లు, తల్లి మహాలక్షమ్మ"
2936,2011 నాటికి పెద యాచవరం గ్రామ జనాభా ఎంత?,The following items are being produced. Pratti...,4610
2969,2011 నాటికి పెద యాచవరం గ్రామ జనాభా ఎంత?,Pochavaram is a village in Vatsavai Mandal of ...,4610
2986,కెనడా దేశ మొదటి ముఖ్యమంత్రి ఎవరు?,This article is a list of the prime ministers ...,సర్ జాన్ అలెగ్జాండర్ మెక్‌డోనాల్డ్
2911,మలేరియా వ్యాధి కి మందు కనిపెట్టిన శాస్త్రవేత్త...,Malaria is a disease spread by mosquitoes. Mal...,హన్స్ ఆండర్సాగ్


In [None]:
import re
import torch


## model behavior by predictions from mT5 model
##for answerable and unanswerable Telugu questions

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def show_model_behavior(example, label, model_number):
    question = example["question"]
    context = example["context"]
    gold = example["answer_inlang"]

    if model_number == 1:
        input_text = f"telugu question: {question} english context: {context}"
    elif model_number == 2:
        input_text = f"Telugu question: {question}"
    elif model_number == 3:
         input_text = f"Telugu question: {question}"
    else:
        raise ValueError("Invalid model number (choose 1, 2, or 3)")

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()


    print(f"\n===== {label.upper()} EXAMPLE =====")
    print(f"Question: {question}")
    print(f"Predicted: {pred}")
    print(f"Target: {gold}")
    print(f"Context snippet: {context[:150]}...")
    print("=" * 60)


ans_examples = df_te_val[df_te_val["answerable"] == True].sample(3, random_state=None)
unans_examples = df_te_val[df_te_val["answerable"] == False].sample(3, random_state=None)


for _, ex in ans_examples.iterrows():
    show_model_behavior(ex, "Answerable", 1)

for _, ex in unans_examples.iterrows():
    show_model_behavior(ex, "Unanswerable", 1)



===== ANSWERABLE EXAMPLE =====
Question: మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు?
Predicted: ిష్టెక్దేశాలిల్ల భారతుము భారతుము దేశ భారతాలిిష్టాలసాగిష్ిష్ భారతదేశాలخريిష్
Target: హరీష్ జైరాజ్
Context snippet: Munna is a 2007 Telugu movie released on May 2. Directed by Paidipalli Vamsi, the film stars Prabhas, Ileana, Prakash Raj, Kota Srinivasa Rao, Rahul D...

===== ANSWERABLE EXAMPLE =====
Question: విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?
Predicted: మీ kedu keduుము
Target: త్రిశంకు
Context snippet: Gods do not come to take possessions. Observing this, Vishvamitra sends Trishanku to heaven in his body. Seeing that, Indra told Trishanku that you, w...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషుల సంఖ్య ఎంత?
Predicted: కి రాజ
Target: 37
Context snippet: Reyyalagadda is a village belonging to Gangaraju Madugula Mandal, Visakhapatnam District. It is 28 km from Gangaraju Madugu, the mandal centre. m. In ...

===== UNANSWERABLE EXAMPLE =====
Questi

In [None]:
import random

#Sampling random examples from the Telugu validation dataset
#and comparing the model's generated answers with the true answers

for i in random.sample(range(len(df_te_val)), 5):
    q = df_te_val.iloc[i]["question"]
    c = df_te_val.iloc[i]["context"]
    target = df_te_val.iloc[i]["answer_inlang"]

    inputs = tokenizer(f"telugu question: {q} english context: {c}", return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = pred.replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
    print(f"\nQ: {q} \nContext {c} \nPred: {pred}\nTarget: {target}")


Q: ఏ ఉష్ణోగ్రత వద్ద నీరు ఆవిరిగా మారుతుంది? 
Context Solubility is the maximum amount of solute present in 100 grams of solvent at constant temperature. For example 100 grams of water can only dissolve 36.3 grams of salt. Thus the solubility of the salt becomes 36.3. Gases also dissolve in various solvents. For example, soda is made when carbon dioxide is added to water. When the temperature is increased, the solubility decreases rapidly and water forms. 
Pred: ఫ్రకీ ప్ర
Target: (100 °సెం.)

Q: ఈస్ట్ ఇండియా కంపెనీ భారతదేశంలోకి ఎప్పుడు వచ్చింది? 
Context The connection between each other is called the All Red Line. The East India Company was responsible for the expansion of the British Empire in Asia. The Company's army first joined the Royal Navy in the Seven Years' War, and both continued to cooperate outside India: the expulsion of Napoleon from Egypt (1799), the subjugation of Java from the Netherlands (1811), the capture of Malacca (1824) and Singapore (1819). Contributed in activ

## Second Model - Telegu Question -> Telegu Answe(mT5-small-finetuned-tydiqa-for-xqa)

In [None]:
from transformers import T5Tokenizer, MT5ForConditionalGeneration, TrainingArguments, Trainer

#Fine-tuning mt5-small-l-finetuned-tydiqa-for-xqa to generate Telugu answers using only the Telugu question as input

model_name =  "mrm8488/mT5-small-finetuned-tydiqa-for-xqa"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_question_only(examples):
    inputs = [f"Telugu question: {q}" for q in examples["question"]] ## input for this model
    targets = examples["answer_inlang"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs



tokenized_train = train_dataset.map(preprocess_question_only, batched=True)
tokenized_val = val_dataset.map(preprocess_question_only, batched=True)



training_args = TrainingArguments(
    output_dir="./mt5_te_en_to_te2",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=40,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()


You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.
Some weights of MT5ForConditionalGeneration were not initialized from the model checkpoint at mrm8488/mT5-small-finetuned-tydiqa-for-xqa and are newly initialized: ['decoder.block.0.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.0.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.1.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.2.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.2.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.3.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.4.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.4.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.5.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.5.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.6.layer.2.DenseReluDense.w

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
100,34.9661
200,0.5517
300,0.3701
400,0.2909
500,0.2527


TrainOutput(global_step=520, training_loss=7.015484996942374, metrics={'train_runtime': 318.6085, 'train_samples_per_second': 6.277, 'train_steps_per_second': 1.632, 'total_flos': 528749690880000.0, 'train_loss': 7.015484996942374, 'epoch': 40.0})

In [None]:
import torch
import re

#Trying out model with some examples

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    "భారతదేశ రాజధాని ఏది?",
    "తాజ్ మహల్ ఎక్కడ ఉంది?",
    "భారతదేశ కరెన్సీ ఏమిటి?"
]

for q in examples:
    input_text = (
        f"Telugu question: {q} "
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Q: భారతదేశ రాజధాని ఏది?
A: న్నెథ్కు

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: 8 ప్రయుెడ్ ఏాన్ెడ్జ్జీెడ్ెడ్దా.

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: థం ప్ర.



In [None]:
#Computing metrics one overvall set

preds, refs = [], []


for i, row in df_te_val.iterrows():
    if not isinstance(row["answer_inlang"], str):
        continue

    question = row["question"]


    input_text = f"Telugu question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()

    preds.append(pred)
    refs.append([row["answer_inlang"]])

bleu_result = bleu.compute(predictions=preds, references=refs)
rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])


data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}



df_results = pd.DataFrame(data)

print(df_results)

  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu  0.47   0.0167      0.0   0.0167


In [None]:
#Computing metrics on answerble and unanswerble of the subset

def evaluate_subset(df_subset, label):
    preds, refs = [], []
    for i, row in df_subset.iterrows():
        if not isinstance(row["answer_inlang"], str):
            continue

        question = row["question"]

        input_text = f"Telugu question: {question}"
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
        outputs = model.generate(**inputs, max_length=64, do_sample=False, num_beams=4)
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        pred = re.sub(r"<extra_id_\\d+>", "", pred).strip()

        preds.append(pred)
        refs.append([row["answer_inlang"]])

    bleu_result = bleu.compute(predictions=preds, references=refs)
    rouge_result = rouge.compute(predictions=preds, references=[r[0] for r in refs])

    return {
        "Type": label,
        "BLEU": round(bleu_result["score"], 2),
        "ROUGE-1": round(rouge_result["rouge1"], 4),
        "ROUGE-2": round(rouge_result["rouge2"], 4),
        "ROUGE-L": round(rouge_result["rougeL"], 4)
    }

df_ans = df_te_val[df_te_val["answerable"] == True]
df_unans = df_te_val[df_te_val["answerable"] == False]

results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))

df_results = pd.DataFrame(results)
print(df_results)

           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable  2.80   0.1429      0.0   0.1429
1  Unanswerable  0.78   0.0000      0.0   0.0000


In [None]:
## Sampling random examples for answerable and unanswerable examples

ans_examples = df_te_val[df_te_val["answerable"] == True].sample(3, random_state=None)
unans_examples = df_te_val[df_te_val["answerable"] == False].sample(3, random_state=None)


for _, ex in ans_examples.iterrows():
    show_model_behavior(ex, "Answerable", 2)

for _, ex in unans_examples.iterrows():
    show_model_behavior(ex, "Unanswerable", 2)



===== ANSWERABLE EXAMPLE =====
Question: 2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ జనాభా ఎంత ?
Predicted: ంప 8..ింగ్)
Target: 433
Context snippet: Boothumillipadu is a village in Gannavaram mandal of Krishna district. It is 7 km from the mandal center Gannavaram. m. In distance, it is 30 km from ...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం గొట్టిప్రోలు గ్రామములో ఎన్ని ఇళ్లులు ఉన్నాయి?
Predicted: 2229
Target: 511
Context snippet: Gottiprolu is a Village in Naidupet Mandal, Sri Potti Sriramulu Nellore District, Andhra Pradesh State. It is 18 km from the mandal center Naidupet. m...

===== ANSWERABLE EXAMPLE =====
Question: విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?
Predicted: కమ్పీన్
Target: త్రిశంకు
Context snippet: Gods do not come to take possessions. Observing this, Vishvamitra sends Trishanku to heaven in his body. Seeing that, Indra told Trishanku that you, w...

===== UNANSWERABLE EXAMPLE =====
Question: మునీష్ దలాల్ వృత్తి ఏమిటి?
Predicted: అనాలల్
Target:

In [None]:
import random

#Random sampling Telugu questions from the validation set and checking
#how the fine-tuned mT5 question-only model responds.

for i in random.sample(range(len(df_te_val)), 5):
    q = df_te_val.iloc[i]["question"]

    target = df_te_val.iloc[i]["answer_inlang"]

    inputs = tokenizer(f"telugu question: {q}", return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = pred.replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
    print(f"\nQ: {q} \nContext {c} \nPred: {pred}\nTarget: {target}")



Q: ఏ ఉష్ణోగ్రత వద్ద నీరు ఆవిరిగా మారుతుంది? 
Context nutrition has the capability to lock a child in a vicious cycle of disease susceptibility and recurring sickness, which threatens cognitive and social development. Undernutrition and bias in access to food and health services leaves children less likely to attend or perform well in school. UNICEF defines undernutrition “as the outcome of insufficient food intake (hunger) and repeated infectious diseases. Under nutrition includes being underweight for one’s age, too short for one’s age (stunted), dangerously thin (wasted), and deficient in vitamins and minerals (micronutrient malnutrient). Under nutrition causes 53% of deaths of children under five across the world. It has 
Pred: (100పు8,0
Target: (100 °సెం.)

Q: ఈస్ట్ ఇండియా కంపెనీ భారతదేశంలోకి ఎప్పుడు వచ్చింది? 
Context nutrition has the capability to lock a child in a vicious cycle of disease susceptibility and recurring sickness, which threatens cognitive and social development. 

## Third Model Telegu Question -> English Answer (flan-t5-small)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Fine-tuning flan-t5-small for Telugu -> English Question-Answer translation

model_name = "google/flan-t5-small"

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_te_to_en(examples):
    inputs = [f"Telegu question: {q}" for q in examples["question"]] #input format for this model
    targets = examples["answer"]  # English answers
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

tokenized_train = train_dataset.map(preprocess_te_to_en, batched=True)
tokenized_val = val_dataset.map(preprocess_te_to_en, batched=True)

training_args = TrainingArguments(
    output_dir="./mt5_te_en_to_te3",
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=40,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
100,6.0183
200,0.1565
300,0.0593
400,0.0429
500,0.0382


TrainOutput(global_step=520, training_loss=1.215895195190723, metrics={'train_runtime': 92.0354, 'train_samples_per_second': 21.731, 'train_steps_per_second': 5.65, 'total_flos': 185890504704000.0, 'train_loss': 1.215895195190723, 'epoch': 40.0})

In [None]:
import torch
import re

#Trying out model

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

examples = [
    "భారతదేశ రాజధాని ఏది?",
    "తాజ్ మహల్ ఎక్కడ ఉంది?",
    "భారతదేశ కరెన్సీ ఏమిటి?"
]

for q in examples:
    input_text = (
        f"Telugu question: {q} "
    )
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model.generate(**inputs, max_length=50)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    cleaned = re.sub(r"<extra_id_\d+>", "", decoded).strip()
    print(f"Q: {q}\nA: {cleaned}\n")


Q: భారతదేశ రాజధాని ఏది?
A: 25 km m

Q: తాజ్ మహల్ ఎక్కడ ఉంది?
A: Nigeria

Q: భారతదేశ కరెన్సీ ఏమిటి?
A: 4610



In [None]:
#running metrics on overall set

preds, refs = [], []

for i, row in df_te_val.iterrows():
    if not isinstance(row["answer"], str):
        continue

    question = row["question"]
    input_text = f"Telegu question: {question}"

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(device)

    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()

    preds.append(pred)
    refs.append([row["answer"]])

data = {
    "Language": ["Telugu"],
    "BLEU": [round(bleu_result["score"], 2)],
    "ROUGE-1": [round(rouge_result["rouge1"], 4)],
    "ROUGE-2": [round(rouge_result["rouge2"], 4)],
    "ROUGE-L": [round(rouge_result["rougeL"], 4)]
}

df_results = pd.DataFrame(data)
print(df_results)

  Language  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0   Telugu  0.47   0.0167      0.0   0.0167


In [None]:

#Evaluating performance separately for answerable and unanswerable questions

df_ans = df_te_val[df_te_val["answerable"] == True]
df_unans = df_te_val[df_te_val["answerable"] == False]


results = []
results.append(evaluate_subset(df_ans, "Answerable"))
results.append(evaluate_subset(df_unans, "Unanswerable"))

df_results = pd.DataFrame(results)
print(df_results)


           Type  BLEU  ROUGE-1  ROUGE-2  ROUGE-L
0    Answerable  0.00   0.0000      0.0   0.0000
1  Unanswerable  0.27   0.0108      0.0   0.0108


In [None]:


## Sampling random examples for answerable and unanswerable examples

ans_examples = df_te_val[df_te_val["answerable"] == True].sample(5, random_state=None)
unans_examples = df_te_val[df_te_val["answerable"] == False].sample(3, random_state=None)

for _, ex in ans_examples.iterrows():
    show_model_behavior(ex, "Answerable", 3)

for _, ex in unans_examples.iterrows():
    show_model_behavior(ex, "Unanswerable", 3)



===== ANSWERABLE EXAMPLE =====
Question: మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు?
Predicted: 4610
Target: హరీష్ జైరాజ్
Context snippet: Munna is a 2007 Telugu movie released on May 2. Directed by Paidipalli Vamsi, the film stars Prabhas, Ileana, Prakash Raj, Kota Srinivasa Rao, Rahul D...

===== ANSWERABLE EXAMPLE =====
Question: విశ్వామిత్రుడు ఏ స్వర్గాన్ని నిర్మించాడు?
Predicted: United Kingdom
Target: త్రిశంకు
Context snippet: Gods do not come to take possessions. Observing this, Vishvamitra sends Trishanku to heaven in his body. Seeing that, Indra told Trishanku that you, w...

===== ANSWERABLE EXAMPLE =====
Question: 2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషుల సంఖ్య ఎంత?
Predicted: DVV
Target: 37
Context snippet: Reyyalagadda is a village belonging to Gangaraju Madugula Mandal, Visakhapatnam District. It is 28 km from Gangaraju Madugu, the mandal centre. m. In ...

===== ANSWERABLE EXAMPLE =====
Question: సింగిరెడ్డి నారాయణరెడ్డి జ్ఞానపీఠ పురస్కారం ను ఎప్పుడు అందుకున్నాడు ?
Pre

In [None]:
import random

#Random sampling Telugu questions from the validation set and checking
#how the fine-tuned flan-t5-small model responds.


for i in random.sample(range(len(df_te_val)), 5):
    q = df_te_val.iloc[i]["question"]

    target = df_te_val.iloc[i]["answer"]

    inputs = tokenizer(f"Telugu question: {q}", return_tensors="pt", truncation=True).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = pred.replace("<extra_id_0>", "").replace("<extra_id_1>", "").strip()
    pred = re.sub(r"<extra_id_\d+>", "", pred).strip()
    print(f"\nQ: {q} \nPred: {pred}\nTarget: {target}")



Q: మున్నా చిత్రానికి సంగీత దర్శకుడు ఎవరు? 
Pred: France
Target: Harish Jairaj

Q: మలేరియా వ్యాధి కి మందు కనిపెట్టిన శాస్త్రవేత్త ఎవరు? 
Pred: France
Target: Hans Andersag

Q: తెలుగు పంచాంగం ప్రకారం నూతన సంవత్సరం ఏ ఇంగ్లీష్ నెలలో ప్రారంభమవుతుంది? 
Pred: Velagapudi Ramakrishna Siddhartha Engineering College
Target: March or April

Q: 2011 జనగణన ప్రకారం మహావా గ్రామంలో ఎంతమంది స్త్రీలు ఉన్నారు? 
Pred: 2229
Target: 548 g

Q: క్షయ వ్యాధికి విరుగుడు ఏ దేశంలో కనుగొన్నారు? 
Pred: DVV Danaya
Target: France
