#Model Training

In [None]:

import pandas as pd
import torch
import transformers
from datasets import Dataset, DatasetDict
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import logging
import csv
import os
from datetime import datetime
import matplotlib.pyplot as plt


# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


# The output directory must exist before the log file can be created in it.
os.makedirs("./mbart_finetuned_paragraphs", exist_ok=True)
log_file = os.path.join("./mbart_finetuned_paragraphs", "training_logs.csv")

# Mode 'w' truncates any log left by a previous run and writes a fresh header.
with open(log_file, mode='w', newline='') as log_handle:
    csv.writer(log_handle).writerow(
        ["epoch", "step", "training_loss", "validation_loss", "learning_rate", "timestamp"]
    )


class CustomLoggingCallback(transformers.TrainerCallback):
    """Trainer callback that appends one CSV row for every logging event.

    Each row holds: epoch, global step, training loss, validation loss,
    learning rate and a wall-clock timestamp. Values that are absent from
    the ``logs`` dict of a given event are written as empty cells.
    """

    def __init__(self, log_file):
        # Path of the CSV file that rows are appended to.
        self.log_file = log_file

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the metrics found in *logs* to the CSV file."""
        if logs is None:
            return

        row = [
            state.epoch,
            state.global_step,
            logs.get("loss", None),
            logs.get("eval_loss", None),
            logs.get("learning_rate", None),
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        ]

        # Re-open in append mode on every event so rows reach disk as training runs.
        with open(self.log_file, mode='a', newline='') as log_handle:
            csv.writer(log_handle).writerow(row)

# Load paragraph dataset
df = pd.read_csv('./Paragraphs.csv')


print("DataFrame shape:", df.shape)
print("Sample data:\n", df.head(5))


# Fail early, with a readable message, when the expected language columns are absent.
missing_columns = [col for col in ("Tamil", "Telugu") if col not in df.columns]
if missing_columns:
    raise KeyError(f"Paragraphs.csv is missing required column(s): {missing_columns}")

# Keep only the translation pair: the remaining CSV columns are never used for
# training and would otherwise be copied into every split. Rows with a missing
# side cannot be tokenized, so they are dropped; the index is reset so that
# Dataset.from_pandas does not add an extra index column.
df1 = (
    df[["Tamil", "Telugu"]]
    .rename(columns={"Tamil": "ta", "Telugu": "te"})
    .dropna()
    .reset_index(drop=True)
)
print("Rows with both Tamil and Telugu text:", len(df1))


dataset = Dataset.from_pandas(df1)
# train_test_split already returns a DatasetDict with "train" and "test" keys.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
print("Train dataset size:", len(split_dataset["train"]))
print("Test dataset size:", len(split_dataset["test"]))

# Load previously fine-tuned mBART model and tokenizer
MBART_MODEL_PATH = "./mbart_finetuned3"
mbart_tokenizer = MBart50TokenizerFast.from_pretrained(MBART_MODEL_PATH, src_lang="ta_IN", tgt_lang="te_IN")
mbart_model = AutoModelForSeq2SeqLM.from_pretrained(MBART_MODEL_PATH).to(device)


# Compare the tokenizer vocabulary with the rows of the model's output embedding.
mbart_vocab_size_tokenizer = len(mbart_tokenizer)
mbart_vocab_size_model = mbart_model.get_output_embeddings().weight.size(0)
print("mBART - Initial tokenizer vocab size:", mbart_vocab_size_tokenizer)
print("mBART - Initial model output vocab size:", mbart_vocab_size_model)


# A mismatch would make some token IDs index outside the embedding matrix,
# so the embeddings are resized to the tokenizer's size when they differ.
if mbart_vocab_size_tokenizer != mbart_vocab_size_model:
    print(f"Warning: mBART vocab size mismatch (Tokenizer: {mbart_vocab_size_tokenizer}, Model: {mbart_vocab_size_model}). Adjusting model embeddings.")
    mbart_model.resize_token_embeddings(mbart_vocab_size_tokenizer)
    print("Post-resize model vocab size:", mbart_model.get_output_embeddings().weight.size(0))
else:
    print("mBART - Vocab sizes match, no adjustment needed.")

# Preprocessing function for paragraphs
def mbart_preprocess_function(examples):
    """Tokenize a batch of Tamil/Telugu paragraph pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "ta" list (source text) and a "te"
            list (target text), as supplied by ``Dataset.map(batched=True)``.

    Returns:
        The tokenizer output for the sources with a "labels" entry holding
        the target token IDs, where padding positions are replaced by -100.
    """
    # `text_target=` tokenizes the targets in target-language mode in the same
    # call; it replaces the deprecated `as_target_tokenizer()` context manager.
    model_inputs = mbart_tokenizer(
        examples["ta"],
        text_target=examples["te"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # -100 marks label positions that the loss should skip (here: padding).
    pad_id = mbart_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token == pad_id else token for token in seq]
        for seq in model_inputs["labels"]
    ]
    return model_inputs

# Apply preprocessing
# Every raw column is dropped after tokenization: listing only "ta"/"te" would
# leave any other CSV column (text and statistics) in the tokenized dataset.
mbart_tokenized_datasets = split_dataset.map(
    mbart_preprocess_function,
    batched=True,
    batch_size=500,
    remove_columns=split_dataset["train"].column_names
)
print("mBART - Tokenized train sample:", mbart_tokenized_datasets["train"][0])

# Training arguments for further fine-tuning
mbart_training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart_finetuned_paragraphs",
    # Evaluate and checkpoint once per epoch; a step-based `save_steps` value
    # would not be used with save_strategy="epoch", so none is passed.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    report_to="none",
    push_to_hub=False,
    # Mixed precision is only requested when a GPU was selected above, so the
    # CPU fallback of this script remains usable.
    fp16=(device == "cuda"),
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    logging_steps=100
)


mbart_data_collator = DataCollatorForSeq2Seq(mbart_tokenizer, model=mbart_model)
# NOTE(review): the captured run shows a warning pointing at this constructor,
# presumably the `tokenizer=` deprecation in newer transformers releases;
# confirm the installed version before switching to `processing_class=`.
mbart_trainer = Seq2SeqTrainer(
    model=mbart_model,
    args=mbart_training_args,
    train_dataset=mbart_tokenized_datasets["train"],
    eval_dataset=mbart_tokenized_datasets["test"],
    tokenizer=mbart_tokenizer,
    data_collator=mbart_data_collator,
    callbacks=[CustomLoggingCallback(log_file)]
)

# Train
print("Further fine-tuning mBART on paragraph data...")
mbart_trainer.train()


# Persist the model and tokenizer side by side so the directory can be
# reloaded with from_pretrained.
mbart_trainer.save_model("./mbart_finetuned_paragraphs")
mbart_tokenizer.save_pretrained("./mbart_finetuned_paragraphs")


# Reload from disk to check that the saved artifacts are usable and consistent.
mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained("./mbart_finetuned_paragraphs").to(device)
mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained("./mbart_finetuned_paragraphs", src_lang="ta_IN", tgt_lang="te_IN")
print("mBART - Saved tokenizer vocab size:", len(mbart_saved_tokenizer))
print("mBART - Saved model output vocab size:", mbart_saved_model.get_output_embeddings().weight.size(0))

# Test translation with debugging
def mbart_translate_text(input_text, debug=False):
    """Translate Tamil text to Telugu with the reloaded fine-tuned model.

    Args:
        input_text: Text passed to the tokenizer; truncated to 512 tokens.
        debug: When True, print the input token IDs, the raw output IDs and
            the decoding that keeps special tokens.

    Returns:
        The first generated sequence, decoded without special tokens and
        stripped of surrounding whitespace.
    """
    encoded = mbart_saved_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    ).to(device)
    if debug:
        print("Tokenized Input IDs:", encoded["input_ids"].tolist())

    # Beam search that is forced to start with the Telugu language token.
    # NOTE(review): min_length=50 also applies to very short inputs - confirm
    # that this is intended for every caller.
    generation_settings = dict(
        max_length=1024,
        min_length=50,
        num_beams=5,
        early_stopping=False,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        forced_bos_token_id=mbart_saved_tokenizer.lang_code_to_id["te_IN"],
    )
    generated = mbart_saved_model.generate(**encoded, **generation_settings)
    best_sequence = generated[0]

    if debug:
        print("Raw Output IDs:", best_sequence.tolist())
        print("Decoded with special tokens:", mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=False))

    translation = mbart_saved_tokenizer.decode(best_sequence, skip_special_tokens=True)
    return translation.strip()

# Test with provided sample paragraph
# Smoke test of the reloaded model: debug=True additionally prints the input
# token IDs, the raw output IDs and the decoding with special tokens kept.
input_text = """ஒவ்வொரு மனிதரும், ஒவ்வொரு வர்த்தகமும், ஏதாவது சிறப்பானவற்றை செய்வதற்கு, வரும் ஆண்டில் மேம்பாட்டிற்கு புத்தாண்டில் தீர்மானம் எடுத்துக் கொள்கிறது. பிரதமர் நரேந்திர மோடி 2021-ம் ஆண்டில் கடைசி மனதின் குரல் மூலம் பொதுமக்களுடன் உரையாடினார். தனிநபர்களின் நற்குணங்களை எடுத்துரைப்பதுடன், சமுதாயம் மற்றும் நாட்டு மக்களிடையே சிறப்பாக செயல்பட கடந்த ஏழு ஆண்டுகளாக அவரின் இந்த பயணம் எவ்வாறு ஊக்கமளிக்கிறது என்பதை அவர் எடுத்துக் கூறினார். மக்கள் சக்திக்கான கருவியாக இந்த தளம் உருவாகி உள்ளது. கடந்த ஆண்டின் அவருடைய கடைசி மனதின் குரலில் ஆசாதிகா அமிர்த மகோத்சவம், இந்திய கலாச்சாரம், தூய்மை, ஒருவரது வாழ்க்கையில் இலக்கியத்தின் மதிப்பு, பெரிய கனவு கண்டு, அந்த கனவுகளை நனவாக்க, உழைப்பதன் முக்கியத்துவம் குறித்து பிரதமர் பேசினார்."""
translated_text = mbart_translate_text(input_text, debug=True)
print("mBART Translation:", translated_text)


def plot_training_logs(log_file):
    """Plot the losses recorded in the training CSV log and save them as a PNG.

    Args:
        log_file: Path of the CSV file with 'step', 'training_loss' and
            'validation_loss' columns.

    The figure is written to 'training_loss_plot.png' inside
    './mbart_finetuned_paragraphs' and then closed.
    """
    history = pd.read_csv(log_file)

    # Each logged row carries either a training loss or a validation loss,
    # so the two curves are built from separate row subsets.
    train_rows = history[history['training_loss'].notnull()]
    eval_rows = history[history['validation_loss'].notnull()]

    fig, ax = plt.subplots(figsize=(10, 6))
    curves = (
        (train_rows, 'training_loss', 'Training Loss', 'o'),
        (eval_rows, 'validation_loss', 'Validation Loss', 's'),
    )
    for rows, column, label, marker in curves:
        if rows.empty:
            continue
        ax.plot(rows['step'], rows[column], label=label, marker=marker)

    ax.set_xlabel('Step')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss Over Time (Paragraph Fine-Tuning)')
    ax.legend()
    ax.grid(True)
    fig.savefig(os.path.join("./mbart_finetuned_paragraphs", "training_loss_plot.png"))
    plt.close(fig)

# Generate plot
# Reads the CSV written by CustomLoggingCallback during training and saves the
# loss curves as an image in the output directory.
plot_training_logs(log_file)
print("Training log plot saved as 'training_loss_plot.png' in the output directory.")


Using device: cuda
DataFrame shape: (2856, 21)
Sample data:
    Index                                            English  English Tokens  \
0      0  Every individual, every business, sets a New Y...             145   
1      1  COVID AND MANPOWER: It is the power of manpowe...              71   
2      2  CAUTION: It's also worth noting that a new Cor...              70   
3      3  GROUP CAPTAIN VARUN SINGH: In the recent trage...             111   
4      4  DISCUSSION ON EXAMS: Every year, I discuss top...              68   

   English Sentences  English Words  \
0                  4             91   
1                  2             43   
2                  3             45   
3                  5             66   
4                  3             45   

                                             Kannada  Kannada Tokens  \
0  ಪ್ರತಿ ವ್ಯಕ್ತಿಯೂ, ಪ್ರತಿ ವೃತ್ತಿಯೂ, ಮುಂದಿನ ವರ್ಷದಲ...             145   
1  ಕೋವಿಡ್ ಮತ್ತು ಮಾನವಶಕ್ತಿ: ಭಾರತವು 100 ವರ್ಷಗಳಲ್ಲಿಯ...              67   
2  ಎಚ್ಚರಿಕೆ: 

Map:   0%|          | 0/2570 [00:00<?, ? examples/s]



Map:   0%|          | 0/286 [00:00<?, ? examples/s]

mBART - Tokenized train sample: {'Index': 2716, 'English': 'By adopting the drip irrigation method, about 74 million kWh of energy was saved in a year. Interlinking of 13 rivers and water transfer in Sardar Sarovar Canal Project, continuous monitoring of Sardar Sarovar Project accelerated the implementation. Due to this, where in 2000-01, 45.12% and 28.59% of electricity were consumed in the agriculture and industrial sector respectively, it changed to 21.10% in the agriculture sector and 35.26 percent in the industrial sector in 2008-09 . These savings were due to the lesser use of running a motor to extract groundwater. Also, less pumping was required due to the rise in water level. This saving was equivalent to 15,459 million tonnes of carbon emissions.', 'English Tokens': 184, 'English Sentences': 6, 'English Words': 114, 'Kannada': 'ಹನಿ ನೀರಾವರಿ ವಿಧಾನವನ್ನು ಅಳವಡಿಸಿಕೊಳ್ಳುವ ಮೂಲಕ, ಒಂದು ವರ್ಷದಲ್ಲಿ ಸುಮಾರು 74 ಮಿಲಿಯನ್ ಕಿವ್ಯಾ. ವಿದ್ಯುತ್ ಉಳಿಸಲಾಗಿದೆ. 13 ನದಿಗಳ ಜೋಡಣೆ ಮತ್ತು ಸರ್ದಾರ್ ಸರೋವರ ಕಾಲುವೆ ಯೋ

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  mbart_trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,2.0957,1.773528
2,1.9791,1.751427
3,1.9127,1.748207


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


mBART - Saved tokenizer vocab size: 250054
mBART - Saved model output vocab size: 250054
Tokenized Input IDs: [[250044, 93860, 69535, 56150, 4, 93860, 209542, 26656, 4, 214978, 199044, 26437, 124002, 108048, 81392, 4, 35235, 120785, 8938, 35576, 938, 175309, 79209, 15413, 51153, 42665, 3769, 174611, 938, 203312, 84194, 64875, 5, 118511, 10860, 4551, 214771, 113627, 64371, 9, 938, 120785, 8938, 129582, 42756, 5944, 2913, 173039, 58540, 227228, 48068, 130110, 6490, 7083, 91677, 5, 59386, 17056, 69179, 17289, 10860, 175331, 13764, 24395, 88310, 12309, 81069, 54174, 4, 241019, 938, 13128, 6, 83862, 151640, 227778, 224066, 228591, 49422, 15377, 24183, 32105, 91585, 2690, 133523, 5894, 144678, 12095, 25683, 9313, 11830, 45237, 2798, 4548, 14622, 152711, 67442, 22173, 203312, 173812, 5, 40252, 139864, 64804, 39311, 6736, 11586, 5894, 86604, 938, 221659, 5414, 49604, 5, 49422, 120785, 18806, 238349, 129582, 42756, 5944, 2913, 14233, 4551, 37368, 6001, 22070, 5944, 10753, 2690, 11449, 2912, 487

#Testing

In [None]:

import torch
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM
from IPython.display import display, HTML

# Device setup: use the GPU when PyTorch can see one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the fine-tuned tokenizer and model from the training output directory.
MODEL_PATH = "./mbart_finetuned_paragraphs"
mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained(
    MODEL_PATH,
    src_lang="ta_IN",
    tgt_lang="te_IN",
)
mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)
mbart_saved_model = mbart_saved_model.to(device)

# Translation function for paragraphs
def mbart_translate_text(input_text, debug=False):
    """Return the Telugu translation of a Tamil paragraph.

    Args:
        input_text: Text passed to the tokenizer; truncated to 512 tokens.
        debug: When True, print the tokenized input, the raw output IDs and
            the output decoded with special tokens.

    Returns:
        The first generated sequence decoded without special tokens, with
        leading and trailing whitespace removed.
    """
    batch = mbart_saved_tokenizer(
        input_text, return_tensors="pt", max_length=512, truncation=True, padding=True
    )
    batch = batch.to(device)
    if debug:
        print("Tokenized Input IDs:", batch["input_ids"].tolist())

    # Decoding is forced to begin with the Telugu language code token.
    telugu_token_id = mbart_saved_tokenizer.lang_code_to_id["te_IN"]
    sequences = mbart_saved_model.generate(
        **batch,
        max_length=1024,
        min_length=50,
        num_beams=5,
        early_stopping=False,
        length_penalty=1.0,
        no_repeat_ngram_size=3,
        forced_bos_token_id=telugu_token_id,
    )
    top_sequence = sequences[0]

    if debug:
        print("Raw Output IDs:", top_sequence.tolist())
        print("Decoded with special tokens:", mbart_saved_tokenizer.decode(top_sequence, skip_special_tokens=False))

    return mbart_saved_tokenizer.decode(top_sequence, skip_special_tokens=True).strip()


def translate_interactively():
    """Run a prompt loop that translates Tamil paragraphs to Telugu.

    Lines are read with ``input()`` until a blank line follows at least one
    non-blank line; the collected lines are joined with spaces and
    translated. Typing ``exit`` as the first line, pressing Ctrl-C, or
    reaching end of input ends the loop.

    All text placed into the rendered HTML (user input, model output and
    error messages) is escaped first, so characters such as ``<`` or ``&``
    are displayed literally instead of being interpreted as markup.
    """
    from html import escape  # local import keeps the cell self-contained

    def _as_html(text):
        # Escape before inserting the markup that preserves spaces and breaks.
        return escape(text).replace(' ', '&nbsp;').replace('\n', '<br>')

    display(HTML("<h3>Tamil to Telugu Paragraph Translator</h3>"))
    print("Enter Tamil paragraph below to translate to Telugu (type 'exit' to stop).")
    print("Tip: Paste multi-line paragraphs and press Enter twice to submit.")

    while True:
        lines = []
        print("Tamil Paragraph Input (press Enter twice to submit):")
        try:
            while True:
                line = input()
                if line == "":
                    if lines:
                        break
                    # Ignore blank lines typed before any text.
                    continue
                lines.append(line)
                # 'exit' as the very first line leaves immediately, without
                # requiring the extra blank line.
                if len(lines) == 1 and line.strip().lower() == "exit":
                    break
        except (EOFError, KeyboardInterrupt):
            # Interrupting the prompt is treated as a request to stop.
            display(HTML("<p style='color: green;'>Exiting translator...</p>"))
            break

        user_input = " ".join(lines).strip()

        #  exit condition
        if user_input.lower() == "exit":
            display(HTML("<p style='color: green;'>Exiting translator...</p>"))
            break

        # Reachable when every entered line consisted of whitespace only.
        if not user_input:
            display(HTML("<p style='color: red;'>Please enter some text.</p>"))
            continue

        try:
            translated_text = mbart_translate_text(user_input, debug=False)
        except Exception as e:
            # Best effort: report the failure and keep the session alive.
            display(HTML(f"<p style='color: red;'>Error during translation: {escape(str(e))}</p>"))
            continue

        # Built outside the f-string: backslashes inside f-string expressions
        # are a syntax error before Python 3.12.
        source_html = _as_html(user_input)
        target_html = _as_html(translated_text)
        display(HTML(
            f"<p><b>Tamil Paragraph:</b><br>{source_html}<br><br>"
            f"<b>Telugu Translation:</b><br>{target_html}</p>"
        ))


# Start the interactive session; the call blocks on input() until the user exits.
translate_interactively()


Using device: cuda


2025-05-10 14:39:07.544498: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 14:39:07.556226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746868147.569925  364523 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746868147.574246  364523 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746868147.584651  364523 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Enter Tamil paragraph below to translate to Telugu (type 'exit' to stop).
Tip: Paste multi-line paragraphs and press Enter twice to submit.
Tamil Paragraph Input (press Enter twice to submit):


KeyboardInterrupt: Interrupted by user

 exit


#Evaluation Metrics

In [None]:
# Install required dependencies

!pip install -q torch==2.3.1 torchvision==0.18.1
!pip install -q transformers==4.41.2 datasets==2.20.0
!pip install -q sacrebleu==2.3.1 pandas==2.2.2 numpy==1.25.2 tqdm==4.66.4
!pip install -q bert-score==0.3.13
!pip install -q protobuf==3.20.3 
!pip install -q indic-nlp-library



# Combines BLEU, BERTScore, COMET, chrF++ and TER evaluation metrics
import os
import numpy as np
import pandas as pd
import torch
import logging
from tqdm import tqdm
from datasets import Dataset
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, AutoConfig, AutoModelForSeq2SeqLM
from sacrebleu import corpus_bleu, corpus_chrf, corpus_ter
from indicnlp.tokenize import indic_tokenize
from bert_score import score as bert_score


# COMET is optional: when the package is missing, that metric is skipped.
try:
    from comet import download_model, load_from_checkpoint
    comet_available = True
except ImportError:
    comet_available = False
    print("COMET not available. Will skip COMET evaluation.")


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")


MODEL_PATH = "./mbart_finetuned_paragraphs"
DATASET_PATH = "./para/testing.csv"
NUM_SAMPLES = 45  # upper bound on the number of rows that are translated and scored

# Output paths
os.makedirs("./para_results", exist_ok=True)
BLEU_OUTPUT_PATH = "./para_results/bleu_evaluation_results.csv"
BERTSCORE_OUTPUT_PATH = "./para_results/bertscore_evaluation_results.csv"
COMET_OUTPUT_PATH = "./para_results/comet_evaluation_results.csv"
CHRF_OUTPUT_PATH = "./para_results/chrf_evaluation_results.csv"
TER_OUTPUT_PATH = "./para_results/ter_evaluation_results.csv"

# Load the model configuration first
# Ensure `early_stopping` holds a concrete value before the model is built.
# NOTE(review): presumably a workaround for a checkpoint whose early_stopping
# was saved as None - confirm it is still required.
logger.info("Loading model configuration...")
config = AutoConfig.from_pretrained(MODEL_PATH)
if hasattr(config, 'generation_config'):
    if config.generation_config.early_stopping is None:
        config.generation_config.early_stopping = True
else:
    config.early_stopping = True


logger.info("Loading model and tokenizer...")
try:
    mbart_saved_model = MBartForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        config=config
    ).to(device)
except Exception as load_error:
    # Only ordinary errors trigger the fallback; KeyboardInterrupt and
    # SystemExit still propagate. The reason is logged rather than hidden.
    logger.warning(f"MBartForConditionalGeneration failed to load ({load_error}); falling back to AutoModelForSeq2SeqLM.")
    mbart_saved_model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_PATH,
        config=config
    ).to(device)


if hasattr(mbart_saved_model, 'generation_config'):
    mbart_saved_model.generation_config.early_stopping = True

mbart_saved_tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_PATH, src_lang="ta_IN", tgt_lang="te_IN")

# Load the test dataset
logger.info("Loading dataset...")
df = pd.read_csv(DATASET_PATH)
print(f"Dataset columns: {df.columns.tolist()}")
print(f"Dataset shape: {df.shape}")
print(f"First few rows:\n{df.head()}")


def _find_column(columns, patterns):
    """Return the column that best matches *patterns*, or None.

    An exact, case-insensitive name match wins. Otherwise the first column
    whose name contains one of the patterns is returned, so a file that
    also has derived columns (for example per-language statistics) does not
    end up using whichever matching column happens to come last.
    """
    wanted = [pattern.lower() for pattern in patterns]
    for col in columns:
        if str(col).strip().lower() in wanted:
            return col
    for col in columns:
        lowered = str(col).lower()
        if any(pattern in lowered for pattern in wanted):
            return col
    return None


# Common column name patterns to check
tamil_patterns = ['tamil_sentence', 'tamil', 'source', 'src', 'Tamil', 'tamil_text']
telugu_patterns = ['telugu_sentence', 'telugu', 'target', 'tgt', 'Telugu', 'telugu_text']

tamil_col = _find_column(df.columns, tamil_patterns)
telugu_col = _find_column(df.columns, telugu_patterns)

if tamil_col is None or telugu_col is None:
    raise ValueError(f"Could not identify Tamil and Telugu columns. Available columns: {df.columns.tolist()}")
if tamil_col == telugu_col:
    raise ValueError(f"Tamil and Telugu resolved to the same column '{tamil_col}'. Available columns: {df.columns.tolist()}")

print(f"Using Tamil column: {tamil_col}")
print(f"Using Telugu column: {telugu_col}")


# Keep complete pairs only; the index is reset so that Dataset.from_pandas
# does not carry the gaps left by dropna into an extra index column.
df = df[[tamil_col, telugu_col]].dropna().reset_index(drop=True)

df = df.rename(columns={tamil_col: 'tamil_sentence', telugu_col: 'telugu_sentence'})
test_dataset = Dataset.from_pandas(df)
print(f"Test dataset size: {len(test_dataset)}")

def indic_tokenize_text(text):
    """Return *text* as space-separated tokens from the Indic NLP tokenizer.

    Empty or missing values (falsy values, or anything pandas treats as NA)
    yield an empty string. Tokenization uses ``lang='te'``.
    """
    is_blank = (not text) or pd.isna(text)
    if is_blank:
        return ""
    tokens = indic_tokenize.trivial_tokenize(text, lang='te')
    return ' '.join(tokens)

# Translation function for paragraphs
def mbart_translate_text(input_text, debug=False):
    """Translate one Tamil paragraph to Telugu for the evaluation run.

    Args:
        input_text: Text passed to the tokenizer; truncated to 512 tokens.
        debug: When True, log the tokenized input, the raw output IDs and
            the output decoded with special tokens at INFO level.

    Returns:
        The first generated sequence decoded without special tokens and
        stripped of surrounding whitespace.
    """
    tokenized = mbart_saved_tokenizer(
        input_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    ).to(device)
    if debug:
        logger.info(f"Tokenized Input IDs: {tokenized['input_ids'].tolist()}")

    # Beam search with early stopping, forced to start with the Telugu token.
    beam_search_options = {
        "max_length": 1024,
        "min_length": 50,
        "num_beams": 5,
        "early_stopping": True,
        "length_penalty": 1.0,
        "no_repeat_ngram_size": 3,
        "forced_bos_token_id": mbart_saved_tokenizer.lang_code_to_id["te_IN"],
    }
    result = mbart_saved_model.generate(**tokenized, **beam_search_options)
    first_sequence = result[0]

    if debug:
        logger.info(f"Raw Output IDs: {first_sequence.tolist()}")
        logger.info(f"Decoded with special tokens: {mbart_saved_tokenizer.decode(first_sequence, skip_special_tokens=False)}")

    plain_text = mbart_saved_tokenizer.decode(first_sequence, skip_special_tokens=True)
    return plain_text.strip()


def generate_translations(dataset, num_samples=NUM_SAMPLES):
    """Translate the first *num_samples* rows of *dataset*.

    Args:
        dataset: Dataset with 'tamil_sentence' and 'telugu_sentence' fields.
        num_samples: Maximum number of rows to translate.

    Returns:
        A tuple ``(test_data, sources, references, hypotheses)`` where the
        three lists are parallel. A row whose translation raises is logged
        and contributes an empty hypothesis.
    """
    sample_count = min(num_samples, len(dataset))
    test_data = dataset.select(range(sample_count))
    logger.info(f"Generating translations for {len(test_data)} samples")

    sources, references, hypotheses = [], [], []
    for row in tqdm(test_data, desc="Generating translations"):
        tamil_text = row["tamil_sentence"]
        telugu_reference = row["telugu_sentence"]

        try:
            telugu_output = mbart_translate_text(tamil_text, debug=False)
        except Exception as e:
            # Keep the lists aligned: a failed row still gets a placeholder.
            logger.warning(f"Error translating '{tamil_text[:50]}...': {e}")
            telugu_output = ""

        sources.append(tamil_text)
        references.append(telugu_reference)
        hypotheses.append(telugu_output)

    return test_data, sources, references, hypotheses

# Compute BLEU score
def compute_bleu(test_data, sources, references, hypotheses):
    """Compute corpus-level BLEU on Indic-tokenized text and save a CSV.

    Args:
        test_data: Unused; kept so all metric functions share one signature.
        sources: Tamil source texts.
        references: Telugu reference translations, parallel to *hypotheses*.
        hypotheses: Telugu system outputs.

    Returns:
        The corpus BLEU score.

    Raises:
        ValueError: If *hypotheses* and *references* differ in length.
    """
    logger.info("Computing BLEU score...")

    if len(hypotheses) != len(references):
        raise ValueError(
            f"Got {len(hypotheses)} hypotheses but {len(references)} references."
        )

    # Tokenize for BLEU calculation
    tokenized_hypotheses = [indic_tokenize_text(hyp) for hyp in hypotheses]
    tokenized_references = [indic_tokenize_text(ref) for ref in references]

    # Compute SacreBLEU score
    # sacrebleu takes a list of reference *streams*, each parallel to the
    # hypotheses. With one reference per segment that is a single stream,
    # `[refs]`; passing `[[ref] for ref in refs]` would instead be read as
    # many alternative references for the first segment only.
    bleu = corpus_bleu(tokenized_hypotheses, [tokenized_references], tokenize='none')
    bleu_score = bleu.score
    logger.info(f"BLEU Score: {bleu_score:.2f}")

    # Save results (the corpus-level score is repeated on every row)
    results_df = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "bleu_score": [bleu_score] * len(sources)
    })
    results_df.to_csv(BLEU_OUTPUT_PATH, index=False)
    logger.info(f"BLEU results saved to {BLEU_OUTPUT_PATH}")

    return bleu_score
    
# Compute chrF++ score
def compute_chrf(test_data, sources, references, hypotheses):
    """Compute corpus-level chrF++ and save a CSV.

    Args:
        test_data: Unused; kept so all metric functions share one signature.
        sources: Tamil source texts.
        references: Telugu reference translations, parallel to *hypotheses*.
        hypotheses: Telugu system outputs.

    Returns:
        The corpus chrF++ score.

    Raises:
        ValueError: If *hypotheses* and *references* differ in length.
    """
    logger.info("Computing chrF++ score...")

    if len(hypotheses) != len(references):
        raise ValueError(
            f"Got {len(hypotheses)} hypotheses but {len(references)} references."
        )

    # sacrebleu expects a list of reference streams, each parallel to the
    # hypotheses; one reference per segment means exactly one stream.
    refs_list = [list(references)]

    # Compute chrF++ score (character 6-grams plus word bigrams)
    chrf = corpus_chrf(hypotheses, refs_list, char_order=6, word_order=2, beta=2)
    chrf_score = chrf.score
    logger.info(f"chrF++ Score: {chrf_score:.2f}")

    # Save results (the corpus-level score is repeated on every row)
    results_df = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "chrf_score": [chrf_score] * len(sources)
    })
    results_df.to_csv(CHRF_OUTPUT_PATH, index=False)
    logger.info(f"chrF++ results saved to {CHRF_OUTPUT_PATH}")

    return chrf_score
    
# Compute TER score (Translation Edit Rate)
def compute_ter(test_data, sources, references, hypotheses):
    """Compute corpus-level TER and save a CSV.

    Args:
        test_data: Unused; kept so all metric functions share one signature.
        sources: Tamil source texts.
        references: Telugu reference translations, parallel to *hypotheses*.
        hypotheses: Telugu system outputs.

    Returns:
        The corpus TER score.

    Raises:
        ValueError: If *hypotheses* and *references* differ in length.
    """
    logger.info("Computing TER score...")

    if len(hypotheses) != len(references):
        raise ValueError(
            f"Got {len(hypotheses)} hypotheses but {len(references)} references."
        )

    # sacrebleu expects a list of reference streams, each parallel to the
    # hypotheses; one reference per segment means exactly one stream.
    refs_list = [list(references)]

    # Compute TER score
    ter = corpus_ter(hypotheses, refs_list)
    ter_score = ter.score
    logger.info(f"TER Score: {ter_score:.2f}")

    # Save results (the corpus-level score is repeated on every row)
    results_df = pd.DataFrame({
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "ter_score": [ter_score] * len(sources)
    })
    results_df.to_csv(TER_OUTPUT_PATH, index=False)
    logger.info(f"TER results saved to {TER_OUTPUT_PATH}")

    return ter_score

# Compute BERTScore
def compute_bertscore(test_data, sources, references, hypotheses):
    """Score the hypotheses with BERTScore and save per-segment F1 to CSV.

    Args:
        test_data: Unused; kept so all metric functions share one signature.
        sources: Tamil source texts.
        references: Telugu reference translations.
        hypotheses: Telugu system outputs.

    Returns:
        The mean F1 over all segments.
    """
    logger.info("Computing BERTScore...")

    # Only the F1 component is reported; precision and recall are discarded.
    _precision, _recall, f1_tensor = bert_score(
        hypotheses,
        references,
        lang="te",
        model_type="bert-base-multilingual-cased",
        device=device,
        verbose=True
    )

    avg_f1 = f1_tensor.mean().item()
    logger.info(f"BERTScore F1: {avg_f1:.4f}")

    # One row per segment with its own F1 value.
    per_segment_f1 = [value.item() for value in f1_tensor]
    table = {
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "bertscore_f1": per_segment_f1,
    }
    pd.DataFrame(table).to_csv(BERTSCORE_OUTPUT_PATH, index=False)
    logger.info(f"BERTScore results saved to {BERTSCORE_OUTPUT_PATH}")

    return avg_f1

# Compute COMET score
def compute_comet(test_data, sources, references, hypotheses):
    """Score the hypotheses with COMET and save per-segment scores to CSV.

    Args:
        test_data: Unused; kept so all metric functions share one signature.
        sources: Tamil source texts.
        references: Telugu reference translations.
        hypotheses: Telugu system outputs.

    Returns:
        The system-level COMET score, or None when the comet package could
        not be imported.
    """
    if not comet_available:
        logger.warning("COMET not available. Skipping COMET evaluation.")
        return None

    logger.info("Computing COMET score...")

    # Fetch the checkpoint and load it onto the selected device.
    logger.info("Downloading COMET model...")
    checkpoint_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(checkpoint_path)
    comet_model.to(device)

    # COMET consumes one dict per segment with source, system output and reference.
    samples = [
        {"src": src, "mt": hyp, "ref": ref}
        for src, hyp, ref in zip(sources, hypotheses, references)
    ]

    logger.info("Running COMET evaluation...")
    gpu_count = 1 if device == "cuda" else 0
    prediction = comet_model.predict(samples, batch_size=8, gpus=gpu_count)
    comet_scores = prediction.scores
    avg_comet = prediction.system_score

    logger.info(f"COMET Score: {avg_comet:.4f}")

    # One row per segment with its own COMET score.
    table = {
        "tamil_sentence": sources,
        "telugu_sentence": references,
        "telugu_hypothesis": hypotheses,
        "comet_score": comet_scores,
    }
    pd.DataFrame(table).to_csv(COMET_OUTPUT_PATH, index=False)
    logger.info(f"COMET results saved to {COMET_OUTPUT_PATH}")

    return avg_comet

# Main evaluation function
def evaluate_model():
    """Translate the test samples, compute every metric and print a summary.

    Also translates one fixed Tamil paragraph with debug logging enabled.

    Returns:
        Dict with the keys 'bleu', 'chrf', 'ter', 'bertscore' and 'comet';
        'comet' is None when COMET is not available.
    """
    # Generate translations once and share them across all metrics.
    test_data, sources, references, hypotheses = generate_translations(test_dataset, NUM_SAMPLES)
    metric_args = (test_data, sources, references, hypotheses)

    # Compute metrics
    bleu_score = compute_bleu(*metric_args)
    bertscore_f1 = compute_bertscore(*metric_args)
    chrf_score = compute_chrf(*metric_args)
    ter_score = compute_ter(*metric_args)
    comet_score = compute_comet(*metric_args) if comet_available else None

    # Print summary
    divider = "=" * 50
    print("\n" + divider)
    print("PARAGRAPH EVALUATION SUMMARY")
    print(divider)
    print(f"Number of samples: {len(sources)}")
    print(f"BLEU Score: {bleu_score:.2f}")
    print(f"chrF++ Score: {chrf_score:.2f}")
    print(f"TER Score: {ter_score:.2f} (lower is better)")
    print(f"BERTScore F1: {bertscore_f1:.4f}")
    if comet_score is not None:
        print(f"COMET Score: {comet_score:.4f}")
    print(divider)

    # Test a single paragraph translation
    test_input = """ஒவ்வொரு மனிதரும், ஒவ்வொரு வர்த்தகமும், ஏதாவது சிறப்பானவற்றை செய்வதற்கு, வரும் ஆண்டில் மேம்பாட்டிற்கு புத்தாண்டில் தீர்மானம் எடுத்துக் கொள்கிறது. பிரதமர் நரேந்திர மோடி 2021-ம் ஆண்டில் கடைசி மனதின் குரல் மூலம் பொதுமக்களுடன் உரையாடினார். தனிநபர்களின் நற்குணங்களை எடுத்துரைப்பதுடன், சமுதாயம் மற்றும் நாட்டு மக்களிடையே சிறப்பாக செயல்பட கடந்த ஏழு ஆண்டுகளாக அவரின் இந்த பயணம் எவ்வாறு ஊக்கமளிக்கிறது என்பதை அவர் எடுத்துக் கூறினார்."""
    translated_text = mbart_translate_text(test_input, debug=True)
    print("\nTest Paragraph Translation:")
    print(f"Source (Tamil):\n{test_input}")
    print(f"Target (Telugu):\n{translated_text}")

    results = {
        "bleu": bleu_score,
        "chrf": chrf_score,
        "ter": ter_score,
        "bertscore": bertscore_f1,
        "comet": comet_score
    }
    return results

# Run the evaluation
# The call returns the metrics dict; as the last expression of the cell its
# value is also echoed in the notebook output.
evaluate_model()


    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
[0m  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[33 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 389, in <module>
  [31m   [0m     main()
  [31m   [0m   File "/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py", line 373, in main
  [31m   [0m     json_out["return_val"] = hook(**hook_input["kwargs"])
  [31m   [0m                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  [31m   [0m   File "/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process

Using device: cuda
Loading model configuration...
Loading model and tokenizer...
Loading dataset...
Generating translations for 45 samples


Dataset columns: ['Tamil', 'Telugu']
Dataset shape: (50, 2)
First few rows:
                                               Tamil  \
0  பெருந்தொற்று தீவிர வடிவம் பெறுவதை உணர்ந்துகொண்...   
1  நான் தமிழகத்தைச் சேர்ந்த ஆனந்த். அரசு வேலைக்கா...   
2  "இந்திய இளைஞர்கள் நேர்மறையானவர்களாகவும், நடைமு...   
3  பிரதமர் மோடி: நேற்று முன்தினம் வரை, அதாவது, ஜூ...   
4  நமது சுதந்திரத்தின் 75-வது ஆண்டை இந்த ஆண்டு கொ...   

                                              Telugu  
0  ఈ మహమ్మారి తిరగబెడితే చాలా ప్రాణాంతకంగా ఉంటుంద...  
1  నేను తమిళనాడుకి చెందిన ఆనంద్ని. మేము ప్రభుత్వ ...  
2  భారతదేశ యువత చాలా సానుకూలమైన దృక్పథంలో, ఆచరణాత...  
3  ప్రధాన మంత్రి మోదీ: జూన్ 1కి ముందు వరకు, అందరూ...  
4  ఈ ఏడాది 75వ స్వాతంత్య్ర దినోత్సవం జరగబోతుంది. ...  
Using Tamil column: Tamil
Using Telugu column: Telugu
Test dataset size: 50


Generating translations: 100%|██████████████████| 45/45 [00:50<00:00,  1.12s/it]
Computing BLEU score...
BLEU Score: 19.41
BLEU results saved to ./para_results/bleu_evaluation_results.csv
Computing BERTScore...


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

BERTScore F1: 0.8081
BERTScore results saved to ./para_results/bertscore_evaluation_results.csv
Computing chrF++ score...
chrF++ Score: 41.34
chrF++ results saved to ./para_results/chrf_evaluation_results.csv
Computing TER score...


done in 0.42 seconds, 107.49 sentences/sec


TER Score: 82.13
TER results saved to ./para_results/ter_evaluation_results.csv
Computing COMET score...
Downloading COMET model...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
Encoder model frozen.
/home/mca/anaconda3/envs/nmt/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
Running COMET evaluation...
You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'hig


PARAGRAPH EVALUATION SUMMARY
Number of samples: 45
BLEU Score: 19.41
chrF++ Score: 41.34
TER Score: 82.13 (lower is better)
BERTScore F1: 0.8081
COMET Score: 0.8580


Raw Output IDs: [2, 250045, 22883, 71259, 4, 22883, 113385, 1092, 116044, 5738, 91641, 5635, 6258, 13973, 73677, 18089, 159858, 1296, 100147, 13367, 93683, 2645, 69273, 1092, 142635, 5, 52844, 218972, 211333, 161920, 64371, 1296, 69502, 3520, 99238, 14839, 129461, 8729, 12217, 1886, 29446, 19203, 46773, 17161, 35396, 5, 80833, 192055, 132095, 93090, 10675, 162136, 4276, 227349, 6258, 4, 52994, 69502, 2078, 40803, 9573, 3227, 16783, 4918, 46551, 73699, 31260, 9573, 33913, 2502, 35396, 5, 2]
Decoded with special tokens: </s>te_IN ప్రతి వ్యక్తి, ప్రతి వ్యాపారం ఏదో ఒక గొప్పదాన్ని చేయటానికి కొత్త సంవత్సరంలో అభివృద్ధి కోసం తీర్మానం చేస్తుంది. ప్రధానమంత్రి నరేంద్ర మోదీ 2021లో ప్రజలతో చివరి మనస్సు వాయిస్ ద్వారా సంభాషించారు. గత ఏడు సంవత్సరాలుగా తన ప్రయాణాలు సమాజాన్ని, దేశ ప్రజలను ఎంత ప్రేరేపించి చేశాయో ప్రస్తావించారు.</s>



Test Paragraph Translation:
Source (Tamil):
ஒவ்வொரு மனிதரும், ஒவ்வொரு வர்த்தகமும், ஏதாவது சிறப்பானவற்றை செய்வதற்கு, வரும் ஆண்டில் மேம்பாட்டிற்கு புத்தாண்டில் தீர்மானம் எடுத்துக் கொள்கிறது. பிரதமர் நரேந்திர மோடி 2021-ம் ஆண்டில் கடைசி மனதின் குரல் மூலம் பொதுமக்களுடன் உரையாடினார். தனிநபர்களின் நற்குணங்களை எடுத்துரைப்பதுடன், சமுதாயம் மற்றும் நாட்டு மக்களிடையே சிறப்பாக செயல்பட கடந்த ஏழு ஆண்டுகளாக அவரின் இந்த பயணம் எவ்வாறு ஊக்கமளிக்கிறது என்பதை அவர் எடுத்துக் கூறினார்.
Target (Telugu):
ప్రతి వ్యక్తి, ప్రతి వ్యాపారం ఏదో ఒక గొప్పదాన్ని చేయటానికి కొత్త సంవత్సరంలో అభివృద్ధి కోసం తీర్మానం చేస్తుంది. ప్రధానమంత్రి నరేంద్ర మోదీ 2021లో ప్రజలతో చివరి మనస్సు వాయిస్ ద్వారా సంభాషించారు. గత ఏడు సంవత్సరాలుగా తన ప్రయాణాలు సమాజాన్ని, దేశ ప్రజలను ఎంత ప్రేరేపించి చేశాయో ప్రస్తావించారు.


{'bleu': 19.412822908062317,
 'chrf': 41.335181730223454,
 'ter': 82.13073005093379,
 'bertscore': 0.8081279993057251,
 'comet': 0.858004789882236}