In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score

In [2]:
import re
from sklearn.model_selection import train_test_split

In [3]:
english_scripts_file_path = "../datasets/Samantar dataset/final_data/en-ta/train.en"
native_scripts_file_path = "../datasets/Samantar dataset/final_data/en-ta/train.ta"

In [4]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)

english_lines = []
with open(english_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        english_lines.append(line)

In [5]:
df = pd.DataFrame({'Tamil': native_lines, 'English': english_lines})
print(df.head())

                                               Tamil  \
0                  என்றுதான் நான் சொல்ல வருகிறேன்.\n   
1               ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.\n   
2  பல வருடங்களாக அவர் அந்த நித்திய எரிநரக தண்டனைய...   
3  அவர் நிதி அமைச்சர் அருண்ஜேட்லியின் முயற்சியை த...   
4  சில கலை வரலாற்றாசிரியர்கள் அவர் ஒரு வருடத்திற்...   

                                             English  
0                         That's what I am saying.\n  
1                   Every tournament is difficult.\n  
2  One of the first questions Flavio posed was, D...  
3  He gave full credit to the Union Finance Minis...  
4  Some art historians have suggested that he onl...  


In [6]:
test_set = df.sample(n = 1000, random_state = 1234)
print(f"Test set size: {len(test_set)}")
print(f"Dev set size: {len(test_set)}")

Test set size: 1000
Dev set size: 1000


In [7]:
native_tamil_sentences_test_set = test_set['Tamil'].values
print(type(native_tamil_sentences_test_set))
print(native_tamil_sentences_test_set.shape)
english_sentences_test_set = test_set['English'].values

<class 'numpy.ndarray'>
(1000,)


In [8]:
def calculate_scores(english_sentence, translation):
    reference = [english_sentence.split()]
    transformed_tokens = translation.split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    return bleu_score, gleu_score, met_score

In [9]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor

# Initialize metrics
indict2_bleu_scores = []
indict2_gleu_scores = []
indict2_meteor_scores = []

# Model and tokenizer setup
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "tam_Taml", "eng_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Convert numpy array to a list
native_tamil_sentences = native_tamil_sentences_test_set.tolist()
english_sentences = english_sentences_test_set.tolist()

# Ensure the datasets are aligned
assert len(native_tamil_sentences) == len(english_sentences), "Dev sets must be of equal length."

for i, tamil_text in enumerate(native_tamil_sentences):
    try:
        # print(f"Processing sentence {i+1}...")
        
        # Preprocessing
        batch = ip.preprocess_batch(
            [tamil_text],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        # print(f"Preprocessed batch: {batch}")
        
        # Tokenize the sentences and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=1,
                num_return_sequences=1,
            )
        # print(f"Generated tokens: {generated_tokens}")

        # Decode the generated tokens into text
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translation = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
        bleu_score, gleu_score, met_score = calculate_scores(english_sentences[i], translation)

        indict2_bleu_scores.append(bleu_score)
        indict2_gleu_scores.append(gleu_score)
        indict2_meteor_scores.append(met_score)

        # Log progress every 10 sentences
        if i % 10 == 0:
            print("Translating sentence", i + 1)
            print("Original Tamil text:", tamil_text)
            print("Original English Conversion:", english_sentences[i])
            print("Translated to English:", translation)
            print("Average BLEU score:", np.mean(indict2_bleu_scores))
            print("Average GLEU score:", np.mean(indict2_gleu_scores))
            print("Average METEOR score:", np.mean(indict2_meteor_scores))
            print()

    except Exception as e:
        print(f"Error processing sentence {i+1}: {e}")
        continue


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence 1
Original Tamil text: ஹிஜாப் அணிந்து வந்த பெண்ணை வெளியேற்றிய அமெரிக்க வங்கி! | Woman thrown out of US bank for wearing hijab

Original English Conversion: Woman kicked out of bank for wearing hijab

Translated to English: US bank throws woman out for wearing hijab
Average BLEU score: 5.072841446586652e-78
Average GLEU score: 0.3076923076923077
Average METEOR score: 0.6388888888888888



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence 11
Original Tamil text: என்ன இது மாற்றமோ ?

Original English Conversion: What change is this?

Translated to English: Is this change?
Average BLEU score: 0.056729049472153764
Average GLEU score: 0.19250985738679702
Average METEOR score: 0.3461015802724724

Translating sentence 21
Original Tamil text: கடாயில் எண்ணெய் ஊற்றி காய்ந்ததும் அதில் மிளகு, கறிவேப்பிலை, வெங்காயம் சேர்த்து தாளித்து இஞ்சி, பூண்டு விழுதுசேர்த்து நன்கு வதக்க வேண்டும்.

Original English Conversion: Once the rice is cooked and sticky, heat oil in a pan and saute garlic, leek, celery and add the sticky rice to it.

Translated to English: Heat oil in a pan, add mustard seeds, when it splutters, add onion, ginger, garlic, green chilies and curry leaves.
Average BLEU score: 0.045077185527924765
Average GLEU score: 0.1763025932825324
Average METEOR score: 0.3320112184048331

Translating sentence 31
Original Tamil text: ஆனால், அது ஓட்டம் பிடித்தது.

Original English Conversion: But it held on.

Translate

In [10]:
average_bleu_score = np.mean(indict2_bleu_scores)
average_gleu_score = np.mean(indict2_gleu_scores)
average_meteor_score = np.mean(indict2_meteor_scores)
print('Average BLEU Score for Dev Set using IndicTrans2 model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using IndicTrans2 model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using IndicTrans2 model: ', average_meteor_score)

Average BLEU Score for Dev Set using IndicTrans2 model:  0.10476763486991329
Average GLEU Score for Dev Set using IndicTrans2 model:  0.2413630417843907
Average METEOR Score for Dev Set using IndicTrans2 model:  0.4140084454567599
