In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 

In [2]:
import re
from sklearn.model_selection import train_test_split

In [3]:
english_scripts_file_path = "../datasets/Samantar dataset/final_data/en-ta/train.en"
native_scripts_file_path = "../datasets/Samantar dataset/final_data/en-ta/train.ta"

In [4]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)

english_lines = []
with open(english_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        english_lines.append(line)

In [5]:
native_lines[1]

'ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.\n'

In [6]:
english_lines[1]

'Every tournament is difficult.\n'

In [7]:
len(native_lines)

5167241

In [8]:
len(english_lines)

5167241

In [9]:
df = pd.DataFrame({'Tamil': native_lines, 'English': english_lines})
print(df.head())

                                               Tamil  \
0                  என்றுதான் நான் சொல்ல வருகிறேன்.\n   
1               ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.\n   
2  பல வருடங்களாக அவர் அந்த நித்திய எரிநரக தண்டனைய...   
3  அவர் நிதி அமைச்சர் அருண்ஜேட்லியின் முயற்சியை த...   
4  சில கலை வரலாற்றாசிரியர்கள் அவர் ஒரு வருடத்திற்...   

                                             English  
0                         That's what I am saying.\n  
1                   Every tournament is difficult.\n  
2  One of the first questions Flavio posed was, D...  
3  He gave full credit to the Union Finance Minis...  
4  Some art historians have suggested that he onl...  


In [10]:
len(df)

5167241

In [11]:
dev_set = df.sample(n = 500, replace = False, random_state = np.random.randint(low=1, high=1234))
remaining_data = df.drop(dev_set.index)

test_set = remaining_data.sample(n = 1000, random_state = np.random.randint(low=1, high=1234))
print(f"Test set size: {len(test_set)}")
print(f"Dev set size: {len(dev_set)}")

Test set size: 1000
Dev set size: 500


In [12]:
native_tamil_sentences_dev_set = dev_set['Tamil'].values
print(type(native_tamil_sentences_dev_set))
print(native_tamil_sentences_dev_set.shape)
english_sentences_dev_set = dev_set['English'].values

<class 'numpy.ndarray'>
(500,)


In [13]:
print(len(english_sentences_dev_set))

500


In [14]:
def calculate_scores(english_sentence, translation):
    reference = [english_sentence.split()]
    transformed_tokens = translation.split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    return bleu_score, gleu_score, met_score

In [15]:
# Native hindi to english translations using the Facebook model
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

m2m100_bleu_scores = []
m2m100_gleu_scores = []
m2m100_meteor_scores = []

# translate Hindi to English
tokenizer.src_lang = "ta"
for i, tamil_text in enumerate(native_tamil_sentences_dev_set):

    encoded_ta = tokenizer(tamil_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_ta, forced_bos_token_id=tokenizer.get_lang_id("en"))
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    bleu_score, gleu_score, met_score = calculate_scores(english_sentences_dev_set[i], translation)

    m2m100_bleu_scores.append(bleu_score)
    m2m100_gleu_scores.append(gleu_score)
    m2m100_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Tamil text :', tamil_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translation)
        print('Average BLEU score :', np.mean(m2m100_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(m2m100_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(m2m100_meteor_scores), '\n')
        print()

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence  1
Original Tamil text : சாரங்கி வாசிப்பது அவ்வளவு சுலபம் இல்லை.

Original English Conversion : Its not easy reading.

Translated to English : It’s not so hard to get a lot of money.
Average BLEU score : 1.0244914152188952e-231 

Average GLEU score : 0.029411764705882353 

Average METEOR score : 0.10869565217391304 


Translating sentence  11
Original Tamil text : 210 கோடி வசூலித்துள்ளது.

Original English Conversion : 210 crore.

Translated to English : There are 210 cents.
Average BLEU score : 0.03142916224017645 

Average GLEU score : 0.0923019543340399 

Average METEOR score : 0.2091386960288684 


Translating sentence  21
Original Tamil text : நான் பயம் விரும்புகிறேன்.

Original English Conversion : I like the fear.

Translated to English : I want fear.
Average BLEU score : 0.016462894506759095 

Average GLEU score : 0.0909226072551541 

Average METEOR score : 0.18956725710783084 


Translating sentence  31
Original Tamil text : பிரச்சினை எல்லாம் தீர்க்கப்பட வ

In [17]:
average_bleu_score = np.mean(m2m100_bleu_scores)
average_gleu_score = np.mean(m2m100_gleu_scores)
average_meteor_score = np.mean(m2m100_meteor_scores)
print('Average BLEU Score for Dev Set using Facebooks M2M2100 model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using Facebooks M2M100 model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using Facebooks M2M100 model: ', average_meteor_score)

Average BLEU Score for Dev Set using Facebooks M2M2100 model:  0.015776594402065847
Average GLEU Score for Dev Set using Facebooks M2M100 model:  0.08141351290061871
Average METEOR Score for Dev Set using Facebooks M2M100 model:  0.1870810822993451


In [20]:
# Native hindi to english translations using the Facebook's MBART model
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

mbart_bleu_scores = []
mbart_gleu_scores = []
mbart_meteor_scores = []

# translate Tamil to English
tokenizer.src_lang = "ta_IN"
for i, tamil_text in enumerate(native_tamil_sentences_dev_set):
    encoded_ta = tokenizer(tamil_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_ta, forced_bos_token_id=tokenizer.lang_code_to_id['en_XX'])
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    bleu_score, gleu_score, met_score = calculate_scores(english_sentences_dev_set[i], translation)

    mbart_bleu_scores.append(bleu_score)
    mbart_gleu_scores.append(gleu_score)
    mbart_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Tamil text :', tamil_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translation)
        print('Average BLEU score :', np.mean(mbart_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(mbart_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(mbart_meteor_scores), '\n')
        print()

Translating sentence  1
Original Tamil text : சாரங்கி வாசிப்பது அவ்வளவு சுலபம் இல்லை.

Original English Conversion : Its not easy reading.

Translated to English : It's not that easy to read a car.
Average BLEU score : 1.2882297539194154e-231 

Average GLEU score : 0.07692307692307693 

Average METEOR score : 0.22727272727272727 


Translating sentence  11
Original Tamil text : 210 கோடி வசூலித்துள்ளது.

Original English Conversion : 210 crore.

Translated to English : 210 crore.
Average BLEU score : 0.02388099904021962 

Average GLEU score : 0.1924608368458636 

Average METEOR score : 0.37767832022564213 


Translating sentence  21
Original Tamil text : நான் பயம் விரும்புகிறேன்.

Original English Conversion : I like the fear.

Translated to English : I want to be afraid.
Average BLEU score : 0.050957526058052593 

Average GLEU score : 0.18751547402318053 

Average METEOR score : 0.35532152152049645 


Translating sentence  31
Original Tamil text : பிரச்சினை எல்லாம் தீர்க்கப்பட வேணும்.


In [21]:
average_bleu_score = np.mean(mbart_bleu_scores)
average_gleu_score = np.mean(mbart_gleu_scores)
average_meteor_score = np.mean(mbart_meteor_scores)
print('Average BLEU Score for Dev Set using Facebooks mBART model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using Facebooks mBART model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using Facebooks mBART model: ', average_meteor_score)

Average BLEU Score for Dev Set using Facebooks mBART model:  0.05498409590800934
Average GLEU Score for Dev Set using Facebooks mBART model:  0.16362644131676832
Average METEOR Score for Dev Set using Facebooks mBART model:  0.3247924788070061


In [22]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor

# Initialize metrics
indict2_bleu_scores = []
indict2_gleu_scores = []
indict2_meteor_scores = []

# Model and tokenizer setup
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "tam_Taml", "eng_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Convert numpy array to a list
native_tamil_sentences = native_tamil_sentences_dev_set.tolist()
english_sentences = english_sentences_dev_set.tolist()

# Ensure the datasets are aligned
assert len(native_tamil_sentences) == len(english_sentences), "Dev sets must be of equal length."

for i, tamil_text in enumerate(native_tamil_sentences):
    try:
        # print(f"Processing sentence {i+1}...")
        
        # Preprocessing
        batch = ip.preprocess_batch(
            [tamil_text],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        # print(f"Preprocessed batch: {batch}")
        
        # Tokenize the sentences and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=1,
                num_return_sequences=1,
            )
        # print(f"Generated tokens: {generated_tokens}")

        # Decode the generated tokens into text
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translation = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
        bleu_score, gleu_score, met_score = calculate_scores(english_sentences_dev_set[i], translation)

        indict2_bleu_scores.append(bleu_score)
        indict2_gleu_scores.append(gleu_score)
        indict2_meteor_scores.append(met_score)

        # Log progress every 10 sentences
        if i % 10 == 0:
            print("Translating sentence", i + 1)
            print("Original Tamil text:", tamil_text)
            print("Original English Conversion:", english_sentences[i])
            print("Translated to English:", translation)
            print("Average BLEU score:", np.mean(indict2_bleu_scores))
            print("Average GLEU score:", np.mean(indict2_gleu_scores))
            print("Average METEOR score:", np.mean(indict2_meteor_scores))
            print()

    except Exception as e:
        print(f"Error processing sentence {i+1}: {e}")
        continue




Translating sentence 1
Original Tamil text: சாரங்கி வாசிப்பது அவ்வளவு சுலபம் இல்லை.

Original English Conversion: Its not easy reading.

Translated to English: It is not easy to read the Sarangi.
Average BLEU score: 6.484592771860512e-155
Average GLEU score: 0.11538461538461539
Average METEOR score: 0.5808080808080809

Translating sentence 11
Original Tamil text: 210 கோடி வசூலித்துள்ளது.

Original English Conversion: 210 crore.

Translated to English: 210 crores.
Average BLEU score: 0.0805639056465013
Average GLEU score: 0.19156091117813603
Average METEOR score: 0.38968411342302184

Translating sentence 21
Original Tamil text: நான் பயம் விரும்புகிறேன்.

Original English Conversion: I like the fear.

Translated to English: I like the fear.
Average BLEU score: 0.1456564005148834
Average GLEU score: 0.23342938242165226
Average METEOR score: 0.4011436649063266

Translating sentence 31
Original Tamil text: பிரச்சினை எல்லாம் தீர்க்கப்பட வேணும்.

Original English Conversion: All problems must

In [23]:
average_bleu_score = np.mean(indict2_bleu_scores)
average_gleu_score = np.mean(indict2_gleu_scores)
average_meteor_score = np.mean(indict2_meteor_scores)
print('Average BLEU Score for Dev Set using IndicTrans2 model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using IndicTrans2 model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using IndicTrans2 model: ', average_meteor_score)

Average BLEU Score for Dev Set using IndicTrans2 model:  0.11736283037882865
Average GLEU Score for Dev Set using IndicTrans2 model:  0.24793760969661255
Average METEOR Score for Dev Set using IndicTrans2 model:  0.425951905439129


In [24]:
# Opus MarianMT
from transformers import pipeline
OpusMT_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

opusMT_bleu_scores = []
opusMT_gleu_scores = []
opusMT_meteor_scores = []

for i, tamil_text in enumerate(native_tamil_sentences_dev_set):

    translation = OpusMT_translator(tamil_text)[0]["translation_text"]
    bleu_score, gleu_score, met_score = calculate_scores(english_sentences_dev_set[i], translation)

    opusMT_bleu_scores.append(bleu_score)
    opusMT_gleu_scores.append(gleu_score)
    opusMT_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Tamil text :', tamil_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translation)
        print('Average BLEU score :', np.mean(opusMT_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(opusMT_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(opusMT_meteor_scores), '\n')
        print()

Translating sentence  1
Original Tamil text : சாரங்கி வாசிப்பது அவ்வளவு சுலபம் இல்லை.

Original English Conversion : Its not easy reading.

Translated to English : It is not so easy to read aloud.
Average BLEU score : 1.2882297539194154e-231 

Average GLEU score : 0.07692307692307693 

Average METEOR score : 0.34090909090909094 


Translating sentence  11
Original Tamil text : 210 கோடி வசூலித்துள்ளது.

Original English Conversion : 210 crore.

Translated to English : It is worth 210 million dollars.
Average BLEU score : 0.03801274224112792 

Average GLEU score : 0.08942669579163644 

Average METEOR score : 0.23569669821979228 


Translating sentence  21
Original Tamil text : நான் பயம் விரும்புகிறேன்.

Original English Conversion : I like the fear.

Translated to English : I want to be scared.
Average BLEU score : 0.029506396486812043 

Average GLEU score : 0.09878293886531438 

Average METEOR score : 0.2507784915834043 


Translating sentence  31
Original Tamil text : பிரச்சினை எல்லாம்

In [25]:
average_bleu_score = np.mean(opusMT_bleu_scores)
average_gleu_score = np.mean(opusMT_gleu_scores)
average_meteor_score = np.mean(opusMT_meteor_scores)
print('Average BLEU Score for Dev Set using OpusMT mul-en model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using OpusMT mul-en model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using OpusMT mul-en model: ', average_meteor_score)

Average BLEU Score for Dev Set using OpusMT mul-en model:  0.028092826067396295
Average GLEU Score for Dev Set using OpusMT mul-en model:  0.10745585386776603
Average METEOR Score for Dev Set using OpusMT mul-en model:  0.24187734196922678
