# Model Training

In [None]:
!pip install transformers datasets sentencepiece sacrebleu accelerate gdown

    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
!pip install torch==2.0.1 torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118

Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0mInstalling collected packages: tokenizers, transformers
[2K  Attempting uninstall: tokenizers
[2K    Found existing installation: tokenizers 0.19.1
[2K    Uninstalling tokenizers-0.19.1:
[2K      Successfully uninstalled tokenizers-0.19.1
[2K  Attempting uninstall: transformers
[2K    Found existing installation: transformers 4.41.2
[2K    Uninstalling transformers-4.41.2:
[2K      Successfully uninstalled transformers-4.41.25;237m╺[0m[38;5;237m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [transformers]
[2K   [38;2;114;156;3

In [None]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Load the parallel corpus. Stray index columns left behind by earlier CSV
# exports are removed when present (errors='ignore' tolerates their absence).
df = pd.read_csv('./merged_output.csv')
df1 = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')


print("DataFrame shape:", df1.shape)
print("Sample data:\n", df1.head(5))  # Show 5 rows


# Short language keys used by the preprocessing function.
df1 = df1.rename(columns={"Tamil": "ta", "Telugu": "te"})

# Rows with a missing sentence on either side would reach the tokenizer as NaN
# and abort the (long) map step, so drop them up front. The index is reset so
# that Dataset.from_pandas does not carry the old index along as an extra column.
rows_before = len(df1)
df1 = df1.dropna(subset=["ta", "te"]).reset_index(drop=True)
if len(df1) != rows_before:
    print(f"Dropped {rows_before - len(df1)} rows with a missing Tamil or Telugu sentence.")


# 90/10 train/test split with a fixed seed so the split is reproducible.
dataset = Dataset.from_pandas(df1)
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
split_dataset = DatasetDict({"train": split_dataset["train"], "test": split_dataset["test"]})
print("Train dataset size:", len(split_dataset["train"]))
print("Test dataset size:", len(split_dataset["test"]))

# Load mBART model and tokenizer (Telugu source, Tamil target).
MBART_MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_MODEL_NAME, src_lang="te_IN", tgt_lang="ta_IN")
mbart_model = AutoModelForSeq2SeqLM.from_pretrained(MBART_MODEL_NAME).to(device)


# Sanity check: the output embedding matrix must have one row per tokenizer entry.
mbart_vocab_size_tokenizer = len(mbart_tokenizer)
mbart_vocab_size_model = mbart_model.get_output_embeddings().weight.size(0)
print("mBART - Initial tokenizer vocab size:", mbart_vocab_size_tokenizer)
print("mBART - Initial model output vocab size:", mbart_vocab_size_model)


if mbart_vocab_size_tokenizer != mbart_vocab_size_model:
    print(f"Warning: mBART vocab size mismatch (Tokenizer: {mbart_vocab_size_tokenizer}, Model: {mbart_vocab_size_model}). Adjusting model embeddings.")
    mbart_model.resize_token_embeddings(mbart_vocab_size_tokenizer)
    print("Post-resize model vocab size:", mbart_model.get_output_embeddings().weight.size(0))
else:
    print("mBART - Vocab sizes match, no adjustment needed.")

# Preprocessing function
def mbart_preprocess_function(examples):
    """Tokenize a batch of Telugu -> Tamil sentence pairs for fine-tuning.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` with a "te" column
            (Telugu source sentences) and a "ta" column (Tamil target sentences).

    Returns:
        The tokenizer output for the Telugu inputs with an added "labels" entry
        holding the token ids of the Tamil targets.
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the tokenizer encodes the targets in target-language mode and
    # returns them under "labels".
    # No padding here: DataCollatorForSeq2Seq pads each batch to its own longest
    # sequence (labels are padded with -100), which avoids spending compute on
    # 128-token rows for short sentences.
    model_inputs = mbart_tokenizer(
        list(examples["te"]),
        text_target=list(examples["ta"]),
        max_length=128,
        truncation=True,
    )
    # Defensive: should padding ever be re-enabled above, pad positions must not
    # contribute to the loss, so they are mapped to the ignore index -100.
    pad_id = mbart_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in seq]
        for seq in model_inputs["labels"]
    ]
    return model_inputs

# Apply preprocessing to both splits in batches of 1000 rows; the raw text
# columns are removed so only the tokenizer outputs remain.
mbart_tokenized_datasets = split_dataset.map(
    mbart_preprocess_function,
    batched=True,
    batch_size=1000,
    remove_columns=["ta", "te"]
)
print("mBART - Tokenized train sample:", mbart_tokenized_datasets["train"][0])

# Training arguments: evaluate and checkpoint once per epoch, keep at most two
# checkpoints, and reload the checkpoint with the best "loss" metric at the end.
# NOTE(review): save_steps presumably has no effect while save_strategy is
# "epoch" — confirm against the TrainingArguments documentation.
mbart_training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart_finetuned_te_to_ta",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    report_to="none",
    push_to_hub=False,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    logging_steps=500,
    save_steps=5000
)

# Data collator and trainer.
# NOTE(review): the captured output shows a warning pointing at the
# Seq2SeqTrainer(...) call; it may relate to the `tokenizer=` argument in newer
# transformers releases — confirm before changing.
mbart_data_collator = DataCollatorForSeq2Seq(mbart_tokenizer, model=mbart_model)
mbart_trainer = Seq2SeqTrainer(
    model=mbart_model,
    args=mbart_training_args,
    train_dataset=mbart_tokenized_datasets["train"],
    eval_dataset=mbart_tokenized_datasets["test"],
    tokenizer=mbart_tokenizer,
    data_collator=mbart_data_collator
)

# Train
print("Training mBART...")
mbart_trainer.train()

# Save the fine-tuned model and the tokenizer into the same directory so they
# can be reloaded together.
mbart_trainer.save_model("./mbart_finetuned_te_to_ta")
mbart_tokenizer.save_pretrained("./mbart_finetuned_te_to_ta")

# Verify saved model: reload from disk and print both vocabulary sizes.
mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained("./mbart_finetuned_te_to_ta").to(device)
mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained("./mbart_finetuned_te_to_ta", src_lang="te_IN", tgt_lang="ta_IN")
print("mBART - Saved tokenizer vocab size:", len(mbart_saved_tokenizer))
print("mBART - Saved model output vocab size:", mbart_saved_model.get_output_embeddings().weight.size(0))

# Test translation with debugging
def mbart_translate_text(input_text, debug=False):
    """Translate Telugu text to Tamil with the reloaded fine-tuned model.

    Args:
        input_text: Telugu text to translate.
        debug: When True, print the input ids, the raw output ids and the
            decoded output including special tokens.

    Returns:
        The first generated sequence decoded without special tokens, stripped
        of surrounding whitespace.
    """
    encoded = mbart_saved_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True,
    ).to(device)
    if debug:
        print("Tokenized Input IDs:", encoded["input_ids"].tolist())

    # Beam search; the Tamil language code is forced as the first generated token.
    generation_kwargs = dict(
        max_length=256,
        min_length=10,
        num_beams=5,
        early_stopping=False,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
        forced_bos_token_id=mbart_saved_tokenizer.lang_code_to_id["ta_IN"],
    )
    generated = mbart_saved_model.generate(**encoded, **generation_kwargs)
    best_sequence = generated[0]

    if debug:
        print("Raw Output IDs:", best_sequence.tolist())
        print("Decoded with special tokens:", mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=False))

    translation = mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=True)
    return translation.strip()

# Smoke test: translate one short Telugu sentence with debug output enabled so
# the token ids and the undecoded output are printed alongside the translation.
input_text = "హలో, మీరు ఎలా ఉన్నారు?"  # "Hello, how are you?" in Telugu
translated_text = mbart_translate_text(input_text, debug=True)
print("mBART Translation:", translated_text)

2025-05-05 17:03:18.040713: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-05 17:03:18.053125: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746444798.067569  100316 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746444798.072122  100316 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746444798.082851  100316 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Using device: cuda
DataFrame shape: (547567, 2)
Sample data:
                                                Tamil  \
0     அவள் பெயர் கூட அவளுக்கு ஒன்றும் நினைவில் இல்லை   
1  சமைப்பது வேகமானது இதன் விளைவாக ஊட்டச்சத்துக்கள...   
2  நாம் ஏற்கனவே செய்வதை ரசிப்பதைக் கண்டுபிடிப்பதற...   
3  இது ஒரு மேனுவல் அல்லது ஆட்டோமேட்டிக் கியர்பாக்...   
4                           இதுவும் நல்ல முயற்சிதான்   

                                              Telugu  
0               కనీసం ఆమె పేరు కూడా ఆయనకు గుర్తులేదు  
1  వంట వేగంగా ఉంటుంది తద్వారా పోషకాలు మరియు విటమి...  
2  మనం ఇప్పటికే ఆనందించేదాన్ని గుర్తించడానికి బదు...  
3  ఇది మాన్యువల్ లేదా ఆటోమేటిక్ గేర్బాక్స్తో పెట్...  
4                ఇది కూడా మంచి ఉపయోగ కరమైన ప్రయత్నమే  
Train dataset size: 492810
Test dataset size: 54757
mBART - Initial tokenizer vocab size: 250054
mBART - Initial model output vocab size: 250054
mBART - Vocab sizes match, no adjustment needed.


Map:   0%|          | 0/492810 [00:00<?, ? examples/s]



Map:   0%|          | 0/54757 [00:00<?, ? examples/s]

mBART - Tokenized train sample: {'input_ids': [250045, 60078, 1296, 483, 6, 136571, 27013, 14206, 4276, 103646, 95432, 8197, 55763, 5271, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [250044, 2690, 3770, 63277, 235753, 8182, 15453, 483, 55963, 86322, 78611, 8285, 6149, 80334, 8182, 13184

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  mbart_trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.0897,1.052916
2,0.8932,0.953157
3,0.7559,0.919803
4,0.6338,0.912127
5,0.5514,0.923989


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


mBART - Saved tokenizer vocab size: 250054
mBART - Saved model output vocab size: 250054
Tokenized Input IDs: [[250045, 24147, 1296, 4, 22735, 24722, 91064, 32, 2]]
Raw Output IDs: [2, 250044, 39507, 66705, 4, 19238, 29947, 128251, 37961, 32, 2]
Decoded with special tokens: </s>ta_IN ஹலோ, நீ எப்படி இருக்கிறாய்?</s>
mBART Translation: ஹலோ, நீ எப்படி இருக்கிறாய்?


# Testing

In [None]:
import torch
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM
from IPython.display import display, HTML

# Device setup: default to CPU and switch to the GPU when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

# Directory written by the training cell; it holds both the fine-tuned
# Telugu-to-Tamil model and its tokenizer.
MODEL_PATH = "./mbart_finetuned_te_to_ta"
mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
mbart_saved_model = mbart_saved_model.to(device)
mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_PATH,
    src_lang="te_IN",
    tgt_lang="ta_IN",
)


def mbart_translate_text(input_text, debug=False):
    """Translate Telugu text into Tamil using the loaded fine-tuned model.

    Args:
        input_text: Telugu text to translate.
        debug: If True, print the tokenized input ids, the raw generated ids
            and the decoded text with special tokens kept.

    Returns:
        The decoded first generated sequence, without special tokens and with
        surrounding whitespace removed.
    """
    model_inputs = mbart_saved_tokenizer(
        input_text, return_tensors="pt", max_length=128, truncation=True, padding=True
    )
    model_inputs = model_inputs.to(device)
    if debug:
        print("Tokenized Input IDs:", model_inputs["input_ids"].tolist())

    output_ids = mbart_saved_model.generate(
        **model_inputs,
        max_length=256,
        min_length=10,
        num_beams=5,
        early_stopping=False,
        length_penalty=1.0,
        no_repeat_ngram_size=2,
        # Force Tamil as the first generated token.
        forced_bos_token_id=mbart_saved_tokenizer.lang_code_to_id["ta_IN"],
    )
    first_sequence = output_ids[0]

    if debug:
        print("Raw Output IDs:", first_sequence.tolist())
        with_specials = mbart_saved_tokenizer.decode(first_sequence, skip_special_tokens=False)
        print("Decoded with special tokens:", with_specials)

    return mbart_saved_tokenizer.decode(first_sequence, skip_special_tokens=True).strip()


def translate_interactively():
    """Run a prompt loop that translates Telugu input to Tamil.

    Reads lines with ``input()`` until the user types 'exit' (case-insensitive).
    Empty input is rejected with a message. Each translation, or the error
    raised while translating, is rendered as HTML in the cell output.
    """
    # Local import keeps this cell self-contained.
    import html

    display(HTML("<h3>Telugu to Tamil Translator</h3>"))
    print("Enter Telugu text below to translate to Tamil (type 'exit' to stop):")

    while True:
        user_input = input("Telugu Input: ").strip()

        # Exit condition
        if user_input.lower() == "exit":
            display(HTML("<p style='color: green;'>Exiting translator...</p>"))
            break

        if not user_input:
            display(HTML("<p style='color: red;'>Please enter some text.</p>"))
            continue

        try:
            translated_text = mbart_translate_text(user_input, debug=False)  # Set debug=True for detailed output
            # User text and model output are arbitrary strings: escape them so
            # characters such as '<' or '&' are shown literally instead of being
            # interpreted as markup by the notebook front end.
            safe_source = html.escape(user_input)
            safe_translation = html.escape(translated_text)
            display(HTML(f"<p><b>Telugu:</b> {safe_source}<br><b>Tamil Translation:</b> {safe_translation}</p>"))
        except Exception as e:
            display(HTML(f"<p style='color: red;'>Error during translation: {html.escape(str(e))}</p>"))


translate_interactively()

Using device: cuda


2025-05-06 15:43:58.014780: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-06 15:43:58.026548: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746526438.040229  306520 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746526438.044411  306520 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746526438.054874  306520 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Enter Telugu text below to translate to Tamil (type 'exit' to stop):


Telugu Input:  నేను పుస్తకం చదువుతున్నాను.


Telugu Input:  నీవు రేపు రాగలవా?


Telugu Input:  ఏఐ వినియోగం రోజురోజుకూ విపరీతంగా పెరుగుతోంది.


Telugu Input:  ఈ తరుణంలో ఓపెన్‌ఏఐ (OpenAI) సంస్థ చాట్‌జీపీటీ (ChatGPT) చాట్‌బాట్‌ వంటివి సంచలనాలు సృష్టిస్తుంటే.


Telugu Input:  చాట్ జీపీటీకి యూజర్ బేస్ పెరిగిన నేపథ్యంలో అనేక ఏఐలు పోటీగా మార్కెట్లోకి వస్తున్నాయి.


Telugu Input:  	Telugu 3	2016 సంవత్సరంలో 62.7 బిలియన్‌ డాలర్లు, 2017 65.3 బిలియన్‌ డాలర్లుగా ఉంది.


Telugu Input:  2016 సంవత్సరంలో 62.7 బిలియన్‌ డాలర్లు, 2017 65.3 బిలియన్‌ డాలర్లుగా ఉంది.


Telugu Input:  exit


# Evaluation Metrics

In [None]:
import numpy as np
import pandas as pd
import torch
import logging
from tqdm import tqdm
from datasets import Dataset
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, AutoConfig, AutoModelForSeq2SeqLM
from sacrebleu import corpus_bleu, corpus_chrf, corpus_ter
from indicnlp.tokenize import indic_tokenize
from bert_score import score as bert_score


import os  # needed only to create the results directory below

# COMET is optional: when the package is missing the COMET metric is skipped.
try:
    from comet import download_model, load_from_checkpoint
    comet_available = True
except ImportError:
    comet_available = False
    print("COMET not available. Will skip COMET evaluation.")


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

MODEL_PATH = "./mbart_finetuned_te_to_ta"
DATASET_PATH = "./testing_pairs.csv"
NUM_SAMPLES = 1260

# Output paths. The directory is created up front: otherwise the first
# to_csv() call fails only after all translations have been generated.
RESULTS_DIR = "./mBART_Reverse_RESULTS"
os.makedirs(RESULTS_DIR, exist_ok=True)

BLEU_OUTPUT_PATH = f"{RESULTS_DIR}/bleu_evaluation_results.csv"
BERTSCORE_OUTPUT_PATH = f"{RESULTS_DIR}/bertscore_evaluation_results.csv"
COMET_OUTPUT_PATH = f"{RESULTS_DIR}/comet_evaluation_results.csv"
CHRF_OUTPUT_PATH = f"{RESULTS_DIR}/chrf_evaluation_results.csv"
TER_OUTPUT_PATH = f"{RESULTS_DIR}/ter_evaluation_results.csv"

# Load the model configuration first and make sure early stopping is enabled,
# either on an attached generation config or directly on the model config.
logger.info("Loading model configuration...")
config = AutoConfig.from_pretrained(MODEL_PATH)
if hasattr(config, 'generation_config'):
    if config.generation_config.early_stopping is None:
        config.generation_config.early_stopping = True
else:
    config.early_stopping = True


logger.info("Loading model and tokenizer...")
try:
    mbart_saved_model = MBartForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        config=config
    ).to(device)
except Exception as load_error:
    # Catch Exception rather than everything, so Ctrl-C / SystemExit still
    # propagate, and record why the first attempt failed before falling back.
    logger.warning(
        f"MBartForConditionalGeneration could not load the model ({load_error}); "
        "falling back to AutoModelForSeq2SeqLM."
    )
    mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_PATH,
        config=config
    ).to(device)


if hasattr(mbart_saved_model, 'generation_config'):
    mbart_saved_model.generation_config.early_stopping = True

mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_PATH, src_lang="te_IN", tgt_lang="ta_IN")

# Load the test dataset and show a quick overview of it.
logger.info("Loading dataset...")
df = pd.read_csv(DATASET_PATH)
print(f"Dataset columns: {df.columns.tolist()}")
print(f"Dataset shape: {df.shape}")
print(f"First few rows:\n{df.head()}")


# Name fragments that identify the source (Telugu) and target (Tamil) columns.
telugu_patterns = ['telugu_sentence', 'telugu', 'source', 'src', 'Telugu', 'telugu_text']
tamil_patterns = ['tamil_sentence', 'tamil', 'target', 'tgt', 'Tamil', 'tamil_text']


def _last_matching_column(columns, patterns):
    """Return the last column whose lowercased name contains any pattern, else None."""
    needles = [pattern.lower() for pattern in patterns]
    found = None
    for name in columns:
        lowered = name.lower()
        if any(needle in lowered for needle in needles):
            found = name
    return found


telugu_col = _last_matching_column(df.columns, telugu_patterns)
tamil_col = _last_matching_column(df.columns, tamil_patterns)

if telugu_col is None or tamil_col is None:
    raise ValueError(f"Could not identify Telugu and Tamil columns. Available columns: {df.columns.tolist()}")

print(f"Using Telugu column: {telugu_col}")
print(f"Using Tamil column: {tamil_col}")


# Keep only the sentence pair, drop incomplete rows and use canonical column names.
df = (
    df[[telugu_col, tamil_col]]
    .dropna()
    .rename(columns={telugu_col: 'telugu_sentence', tamil_col: 'tamil_sentence'})
)
test_dataset = Dataset.from_pandas(df)
print(f"Test dataset size: {len(test_dataset)}")


def indic_tokenize_text(text):
    """Tokenize text with the Indic NLP trivial tokenizer (lang='ta').

    Returns the tokens joined by single spaces, or an empty string when the
    input is falsy or NaN.
    """
    is_blank = (not text) or pd.isna(text)
    if is_blank:
        return ""
    tokens = indic_tokenize.trivial_tokenize(text, lang='ta')
    return ' '.join(tokens)


def mbart_translate_text(input_text, debug=False):
    """Translate Telugu text to Tamil with the settings used for evaluation.

    Args:
        input_text: Telugu text to translate.
        debug: When True, log the input ids, the raw output ids and the decoded
            output including special tokens at INFO level.

    Returns:
        The first generated sequence decoded without special tokens, stripped
        of surrounding whitespace.
    """
    encoded = mbart_saved_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=128,
        truncation=True,
        padding=True,
    ).to(device)
    if debug:
        logger.info(f"Tokenized Input IDs: {encoded['input_ids'].tolist()}")

    # Beam search; the Tamil language code is forced as the first generated token.
    generation_kwargs = dict(
        max_length=256,
        min_length=10,
        num_beams=5,
        early_stopping=True,
        length_penalty=1.2,
        no_repeat_ngram_size=3,
        forced_bos_token_id=mbart_saved_tokenizer.lang_code_to_id["ta_IN"],
    )
    generated = mbart_saved_model.generate(**encoded, **generation_kwargs)
    best_sequence = generated[0]

    if debug:
        logger.info(f"Raw Output IDs: {best_sequence.tolist()}")
        logger.info(f"Decoded with special tokens: {mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=False)}")

    translation = mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=True)
    return translation.strip()


def generate_translations(dataset, num_samples=NUM_SAMPLES):
    """Translate the first ``num_samples`` rows of ``dataset``.

    Args:
        dataset: Dataset with "telugu_sentence" and "tamil_sentence" columns.
        num_samples: Upper bound on the number of rows to translate.

    Returns:
        A tuple ``(test_data, sources, references, hypotheses)``: the selected
        subset plus three parallel lists. A row whose translation raises is
        logged as a warning and gets an empty-string hypothesis.
    """
    subset_size = min(num_samples, len(dataset))
    test_data = dataset.select(range(subset_size))
    logger.info(f"Generating translations for {len(test_data)} samples")

    sources, references, hypotheses = [], [], []
    for row in tqdm(test_data, desc="Generating translations"):
        telugu_text = row["telugu_sentence"]
        tamil_reference = row["tamil_sentence"]

        try:
            tamil_output = mbart_translate_text(telugu_text, debug=False)
        except Exception as e:
            logger.warning(f"Error translating '{telugu_text}': {e}")
            tamil_output = ""

        sources.append(telugu_text)
        references.append(tamil_reference)
        hypotheses.append(tamil_output)

    return test_data, sources, references, hypotheses

# Compute BLEU score
def compute_bleu(test_data, sources, references, hypotheses):
    """Compute corpus-level BLEU on Indic-tokenized text and save the results.

    Args:
        test_data: Evaluated dataset subset (not used here; kept so all metric
            functions share one signature).
        sources: Telugu source sentences.
        references: Tamil reference sentences, one per source.
        hypotheses: Tamil model outputs, one per source.

    Returns:
        The corpus BLEU score. The CSV written to BLEU_OUTPUT_PATH repeats this
        corpus-level score on every row.
    """
    logger.info("Computing BLEU score...")

    tokenized_hypotheses = [indic_tokenize_text(hyp) for hyp in hypotheses]
    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. There is a single reference per sentence, so this is
    # one stream holding every reference. (Wrapping each reference in its own
    # list would instead declare N streams of one sentence each.)
    tokenized_references = [[indic_tokenize_text(ref) for ref in references]]

    # Text is already tokenized above, so sacreBLEU's own tokenizer is disabled.
    bleu = corpus_bleu(tokenized_hypotheses, tokenized_references, tokenize='none')
    bleu_score = bleu.score
    logger.info(f"BLEU Score: {bleu_score:.2f}")

    results_df = pd.DataFrame({
        "telugu_sentence": sources,
        "tamil_sentence": references,
        "tamil_hypothesis": hypotheses,
        "bleu_score": [bleu_score] * len(sources)
    })
    results_df.to_csv(BLEU_OUTPUT_PATH, index=False)
    logger.info(f"BLEU results saved to {BLEU_OUTPUT_PATH}")

    return bleu_score
    
# Compute chrF++ score
def compute_chrf(test_data, sources, references, hypotheses):
    """Compute corpus-level chrF++ and save the results.

    Args:
        test_data: Evaluated dataset subset (not used here; kept so all metric
            functions share one signature).
        sources: Telugu source sentences.
        references: Tamil reference sentences, one per source.
        hypotheses: Tamil model outputs, one per source.

    Returns:
        The corpus chrF++ score. The CSV written to CHRF_OUTPUT_PATH repeats
        this corpus-level score on every row.
    """
    logger.info("Computing chrF++ score...")

    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses; with one reference per sentence that is a single
    # stream containing all references.
    refs_list = [list(references)]

    # word_order=2 turns chrF into chrF++.
    chrf = corpus_chrf(hypotheses, refs_list, char_order=6, word_order=2, beta=2)
    chrf_score = chrf.score
    logger.info(f"chrF++ Score: {chrf_score:.2f}")

    results_df = pd.DataFrame({
        "telugu_sentence": sources,
        "tamil_sentence": references,
        "tamil_hypothesis": hypotheses,
        "chrf_score": [chrf_score] * len(sources)
    })
    results_df.to_csv(CHRF_OUTPUT_PATH, index=False)
    logger.info(f"chrF++ results saved to {CHRF_OUTPUT_PATH}")

    return chrf_score
    
# Compute TER score
def compute_ter(test_data, sources, references, hypotheses):
    """Compute corpus-level TER (lower is better) and save the results.

    Args:
        test_data: Evaluated dataset subset (not used here; kept so all metric
            functions share one signature).
        sources: Telugu source sentences.
        references: Tamil reference sentences, one per source.
        hypotheses: Tamil model outputs, one per source.

    Returns:
        The corpus TER score. The CSV written to TER_OUTPUT_PATH repeats this
        corpus-level score on every row.
    """
    logger.info("Computing TER score...")

    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses; with one reference per sentence that is a single
    # stream containing all references.
    refs_list = [list(references)]

    ter = corpus_ter(hypotheses, refs_list)
    ter_score = ter.score
    logger.info(f"TER Score: {ter_score:.2f}")

    results_df = pd.DataFrame({
        "telugu_sentence": sources,
        "tamil_sentence": references,
        "tamil_hypothesis": hypotheses,
        "ter_score": [ter_score] * len(sources)
    })
    results_df.to_csv(TER_OUTPUT_PATH, index=False)
    logger.info(f"TER results saved to {TER_OUTPUT_PATH}")

    return ter_score

# Compute BERTScore
def compute_bertscore(test_data, sources, references, hypotheses):
    """Compute BERTScore with multilingual BERT and save per-sentence F1 values.

    Args:
        test_data: Evaluated dataset subset (not used here).
        sources: Telugu source sentences.
        references: Tamil reference sentences.
        hypotheses: Tamil model outputs.

    Returns:
        The mean F1 over all sentence pairs. The per-sentence F1 values are
        written to BERTSCORE_OUTPUT_PATH.
    """
    logger.info("Computing BERTScore...")

    scorer_options = dict(
        lang="ta",
        model_type="bert-base-multilingual-cased",
        device=device,
        verbose=True,
    )
    # Precision and recall are returned as well; only F1 is reported.
    P, R, F1 = bert_score(hypotheses, references, **scorer_options)

    avg_f1 = F1.mean().item()
    logger.info(f"BERTScore F1: {avg_f1:.4f}")

    per_sentence_f1 = [value.item() for value in F1]

    columns = {
        "telugu_sentence": sources,
        "tamil_sentence": references,
        "tamil_hypothesis": hypotheses,
        "bertscore_f1": per_sentence_f1,
    }
    pd.DataFrame(columns).to_csv(BERTSCORE_OUTPUT_PATH, index=False)
    logger.info(f"BERTScore results saved to {BERTSCORE_OUTPUT_PATH}")

    return avg_f1

# Compute COMET score
def compute_comet(test_data, sources, references, hypotheses):
    """Score the translations with the Unbabel/wmt22-comet-da checkpoint.

    Args:
        test_data: Evaluated dataset subset (not used here).
        sources: Telugu source sentences.
        references: Tamil reference sentences.
        hypotheses: Tamil model outputs.

    Returns:
        The system-level COMET score, or None when the COMET package could not
        be imported. Per-sentence scores are written to COMET_OUTPUT_PATH.
    """
    if not comet_available:
        logger.warning("COMET not available. Skipping COMET evaluation.")
        return None

    logger.info("Computing COMET score...")

    checkpoint_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(checkpoint_path)
    comet_model.to(device)

    # One record per sentence: source, machine translation and reference.
    data = [
        {"src": src, "mt": hyp, "ref": ref}
        for src, hyp, ref in zip(sources, hypotheses, references)
    ]

    logger.info("Running COMET evaluation...")
    gpu_count = 1 if device == "cuda" else 0
    model_output = comet_model.predict(data, batch_size=8, gpus=gpu_count)
    comet_scores = model_output.scores
    avg_comet = model_output.system_score

    logger.info(f"COMET Score: {avg_comet:.4f}")

    columns = {
        "telugu_sentence": sources,
        "tamil_sentence": references,
        "tamil_hypothesis": hypotheses,
        "comet_score": comet_scores,
    }
    pd.DataFrame(columns).to_csv(COMET_OUTPUT_PATH, index=False)
    logger.info(f"COMET results saved to {COMET_OUTPUT_PATH}")

    return avg_comet

# Main evaluation function
def evaluate_model():
    """Run the full evaluation and print a summary.

    Generates translations for the first NUM_SAMPLES rows of ``test_dataset``,
    computes BLEU, BERTScore, chrF++, TER and (when available) COMET, prints
    the scores and finishes with one debug translation of a fixed sentence.

    Returns:
        A dict with the keys "bleu", "chrf", "ter", "bertscore" and "comet"
        ("comet" is None when COMET was skipped).
    """
    test_data, sources, references, hypotheses = generate_translations(test_dataset, NUM_SAMPLES)
    metric_args = (test_data, sources, references, hypotheses)

    bleu_score = compute_bleu(*metric_args)
    bertscore_f1 = compute_bertscore(*metric_args)
    chrf_score = compute_chrf(*metric_args)
    ter_score = compute_ter(*metric_args)
    comet_score = compute_comet(*metric_args) if comet_available else None

    ruler = "=" * 50
    print("\n" + ruler)
    print("EVALUATION SUMMARY")
    print(ruler)
    print(f"Number of samples: {len(sources)}")
    print(f"BLEU Score: {bleu_score:.2f}")
    print(f"chrF++ Score: {chrf_score:.2f}")
    print(f"TER Score: {ter_score:.2f} (lower is better)")
    print(f"BERTScore F1: {bertscore_f1:.4f}")
    if comet_score is not None:
        print(f"COMET Score: {comet_score:.4f}")
    print(ruler)

    # Single qualitative check with debug logging enabled.
    test_input = "హాయ్, మీరు ఎలా ఉన్నారు?"  # "Hi, how are you?"
    translated_text = mbart_translate_text(test_input, debug=True)
    print("\nTest Translation:")
    print(f"Source (Telugu): {test_input}")
    print(f"Target (Tamil): {translated_text}")

    results = {
        "bleu": bleu_score,
        "chrf": chrf_score,
        "ter": ter_score,
        "bertscore": bertscore_f1,
        "comet": comet_score,
    }
    return results


if __name__ == "__main__":
    evaluate_model()

Using device: cuda
Loading model configuration...
Loading model and tokenizer...
Loading dataset...
Generating translations for 1260 samples


Dataset columns: ['English', 'Telugu', 'Tamil']
Dataset shape: (1263, 3)
First few rows:
                                             English  \
0  The two leaders also discussed global developm...   
1  On the occasion of Ambedkar Jayanti today, Pri...   
2  He said the Government is working with a diffe...   
3  He said the aim is to complete this task by 2022.   
4  He said these Health and Wellness Centres woul...   

                                              Telugu  \
0  ఉభ‌య నేత‌లు ప్ర‌పంచ అభివృద్ధి సంబంధిత ఆర్థిక స...   
1  నేడు ఆంబేడ్ కర్ జయంతి సందర్భంగా, ప్రధాన మంత్రి...   
2  ఈ 115 జిల్లాల విషయంలో ప్రభుత్వం ఒక వ్యత్యాసభరి...   
3  ఈ కార్యభారాన్ని 2022 కల్లా పూర్తి చేయాలన్నదే ల...   
4  ఈ హెల్త్ అండ్ వెల్ నెస్ సెంటర్ లు పేదలకు ఒక కు...   

                                               Tamil  
0  உலகளாவிய மேம்பாட்டுக்கான நிதி உள்ளிட்டவை குறித...  
1  அம்பேத்கர் பிறந்த தினமான இன்று, மத்திய அரசின் ...  
2  இந்த 115 மாவட்டங்கள் மீதும் மாறுபட்ட கண்ணோட்டத...  
3  இந்தப் பணியை 2

Generating translations: 100%|██████████████| 1260/1260 [10:51<00:00,  1.93it/s]
Computing BLEU score...
BLEU Score: 39.13
BLEU results saved to ./mBART_Reverse_RESULTS/bleu_evaluation_results.csv
Computing BERTScore...


calculating scores...
computing bert embedding.


  0%|          | 0/40 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/20 [00:00<?, ?it/s]

BERTScore F1: 0.8496
BERTScore results saved to ./mBART_Reverse_RESULTS/bertscore_evaluation_results.csv
Computing chrF++ score...


done in 4.93 seconds, 255.68 sentences/sec


chrF++ Score: 42.48
chrF++ results saved to ./mBART_Reverse_RESULTS/chrf_evaluation_results.csv
Computing TER score...
TER Score: 81.08
TER results saved to ./mBART_Reverse_RESULTS/ter_evaluation_results.csv
Computing COMET score...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
Running COMET evaluation...
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelis


EVALUATION SUMMARY
Number of samples: 1260
BLEU Score: 39.13
chrF++ Score: 42.48
TER Score: 81.08 (lower is better)
BERTScore F1: 0.8496
COMET Score: 0.9203


Raw Output IDs: [2, 250044, 39507, 37961, 4, 19238, 29947, 128251, 37961, 32, 2]
Decoded with special tokens: </s>ta_IN ஹாய், நீ எப்படி இருக்கிறாய்?</s>



Test Translation:
Source (Telugu): హాయ్, మీరు ఎలా ఉన్నారు?
Target (Tamil): ஹாய், நீ எப்படி இருக்கிறாய்?
