In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 

In [2]:
import re
from sklearn.model_selection import train_test_split

In [3]:
english_scripts_file_path = "../datasets/Samantar dataset/final_data/en-ta/train.en"
native_scripts_file_path = "../datasets/Samantar dataset/final_data/en-ta/train.ta"

In [4]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)

english_lines = []
with open(english_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        english_lines.append(line)

In [5]:
native_lines[1]

'ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.\n'

In [6]:
english_lines[1]

'Every tournament is difficult.\n'

In [7]:
len(native_lines)

5167241

In [8]:
len(english_lines)

5167241

In [9]:
df = pd.DataFrame({'Hindi': native_lines, 'English': english_lines})
print(df.head())

                                               Hindi  \
0                  என்றுதான் நான் சொல்ல வருகிறேன்.\n   
1               ஒவ்வொரு சுற்றுப்பயணமும் கடினமானது.\n   
2  பல வருடங்களாக அவர் அந்த நித்திய எரிநரக தண்டனைய...   
3  அவர் நிதி அமைச்சர் அருண்ஜேட்லியின் முயற்சியை த...   
4  சில கலை வரலாற்றாசிரியர்கள் அவர் ஒரு வருடத்திற்...   

                                             English  
0                         That's what I am saying.\n  
1                   Every tournament is difficult.\n  
2  One of the first questions Flavio posed was, D...  
3  He gave full credit to the Union Finance Minis...  
4  Some art historians have suggested that he onl...  


In [10]:
len(df)

5167241

In [11]:
dev_set = df.sample(n = 500, replace = False, random_state = np.random.randint(low=1, high=1234))
remaining_data = df.drop(dev_set.index)

test_set = remaining_data.sample(n = 1000, random_state = np.random.randint(low=1, high=1234))
print(f"Test set size: {len(test_set)}")
print(f"Dev set size: {len(dev_set)}")

Test set size: 1000
Dev set size: 500


In [12]:
native_tamil_sentences_dev_set = dev_set['Hindi'].values
print(type(native_tamil_sentences_dev_set))
print(native_tamil_sentences_dev_set.shape)
english_sentences_dev_set = dev_set['English'].values

<class 'numpy.ndarray'>
(500,)


In [13]:
print(len(english_sentences_dev_set))

500


In [14]:
# Native hindi to english translations using the Facebook model
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

facebook_bleu_scores = []
facebook_gleu_scores = []
facebook_meteor_scores = []

# translate Hindi to English
tokenizer.src_lang = "ta"
for i, tamil_text in enumerate(native_tamil_sentences_dev_set):
    # print(i)
    encoded_ta = tokenizer(tamil_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_ta, forced_bos_token_id=tokenizer.get_lang_id("en"))
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # print(translated_text)
    # print(translated_text[0].split())
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translated_text[0].split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    facebook_bleu_scores.append(bleu_score)
    facebook_gleu_scores.append(gleu_score)
    facebook_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Tamil text :', tamil_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translated_text[0])
        print('Average BLEU score :', np.mean(facebook_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(facebook_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(facebook_meteor_scores), '\n')
        print()

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence  1
Original Tamil text : மாண்புமிகு வணிகவரி மற்றும் பதிவுத் துறை அமைச்சர் அவர்களால் 21.09.2015 அன்று சட்டமன்றப் பேரவையில் வெளியிடப்பட்ட அறிவிப்பிற்கிணங்க, ரூ.5,00,43,500/மதிப்பீட்டில் வணிகவரி அலுவலகக் கட்டடங்களில் உட்கட்டமைப்பு வசதிகளை மேம்படுத்தவும்மேம்படுத்தவும்,, முழுமையான கணினி மயமாக்கல் திட்டத்தின் கீழ் அமைக்கப்பட்ட விலை உயர்ந்த தகவல் தொழில்நுட்ப சாதனங்களைப் பாதுகாக்கும் வகையில் வணிகவரி அலுவலகக் கட்டடங்களை பொதுப்பணித்துறை மூலம் பழுது பார்ப்பதற்கும், பராமரிப்பதற்கும், அலுவலகங்களுக்கு தேவையான தளவாடங்களை டான்சி மூலம் கொள்முதல் செய்வதற்கும் நிதி ஒடுக்கீடு செய்யப்பட்டுள்ளது.

Original English Conversion : As announced by the Hon'ble Minister for Commercial Taxes and Registration during the demand for grants for the year 2015-16 in the Assembly on 21.09.2015, the Government have accorded sanction for a sum of Rs.10.00 crore for construction of an Integrated Office Complex at Survey No.375No.375//11,, Varadharajapuram village, Nazarathpettai in Poonamallee Taluk of T

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence  11
Original Tamil text : இம்ரான் கான், முன்னாள் பிரதமர் அப்பாஸி வேட்புமனுக்கள் தள்ளுபடி

Original English Conversion : Former Pak PM Abbasi calls for elections to remove Imran Khan govt

Translated to English : And he was the one who was the one who was the one who was the one who was the one who was the one.
Average BLEU score : 9.5824471484138e-156 

Average GLEU score : 0.039133207099052505 

Average METEOR score : 0.09992419415908547 


Translating sentence  21
Original Tamil text : 29 மாநிலங்கள் மற்றும் 2 யூனியன் பிரதேசங்களைச் சேர்ந்த அறிவியல்-தொழில்நுட்பக் கவுன்சில்களைச் சேர்ந்த அதிகாரிகள், இந்த தொடக்க நிகழ்ச்சியில் கலந்துகொண்டனர்.

Original English Conversion : Officials of S& T Councils from 29 States and 2 Union Territories (UTs) participated during the inaugural session

Translated to English : Officers involved in the 29 counties and 2 June counties of the Scientific and Communications Council, attended this opening event.
Average BLEU score : 7.7540312

In [15]:
average_bleu_score = np.mean(facebook_bleu_scores)
average_gleu_score = np.mean(facebook_gleu_scores)
average_meteor_score = np.mean(facebook_meteor_scores)
print('Average BLEU Score for Test Set using Facebooks model: ', average_bleu_score)
print('Average GLEU Score for Test Set using Facebooks model: ', average_gleu_score)
print('Average METEOR Score for Test Set using Facebooks model: ', average_meteor_score)

Average BLEU Score for Test Set using Facebooks model:  0.011616463774013108
Average GLEU Score for Test Set using Facebooks model:  0.0735773947236834
Average METEOR Score for Test Set using Facebooks model:  0.16908943906807566


In [16]:
import accelerate
print(accelerate.__version__)

1.1.1


In [17]:
"""
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = 'jbochi/madlad400-3b-mt'
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
tokenizer = T5Tokenizer.from_pretrained(model_name)

google_bleu_scores = []
google_gleu_scores = []
google_meteor_scores = []

for i, hindi_text in enumerate(native_hindi_sentences_dev_set):
    # print(i)
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids=input_ids)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translated_text.split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    google_bleu_scores.append(bleu_score)
    google_gleu_scores.append(gleu_score)
    google_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Hindi text :', hindi_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translated_text[0])
        print('Average BLEU score :', np.mean(google_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(google_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(google_meteor_scores), '\n')
        print()
"""

'\nfrom transformers import T5ForConditionalGeneration, T5Tokenizer\n\nmodel_name = \'jbochi/madlad400-3b-mt\'\nmodel = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")\ntokenizer = T5Tokenizer.from_pretrained(model_name)\n\ngoogle_bleu_scores = []\ngoogle_gleu_scores = []\ngoogle_meteor_scores = []\n\nfor i, hindi_text in enumerate(native_hindi_sentences_dev_set):\n    # print(i)\n    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)\n    outputs = model.generate(input_ids=input_ids)\n    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n    reference = [english_sentences_dev_set[i].split()]\n    transformed_tokens = translated_text.split()\n\n    bleu_score = sentence_bleu(reference, transformed_tokens)\n    gleu_score = sentence_gleu(reference, transformed_tokens)\n    met_score = meteor_score(reference, transformed_tokens)\n\n    google_bleu_scores.append(bleu_score)\n    google_gleu_scores.append(gleu_score

In [18]:
"""
average_bleu_score = np.mean(google_bleu_scores)
average_gleu_score = np.mean(google_gleu_scores)
average_meteor_score = np.mean(google_meteor_scores)
print('Average BLEU Score for Test Set using Googles model: ', average_bleu_score)
print('Average GLEU Score for Test Set using Googles model: ', average_gleu_score)
print('Average METEOR Score for Test Set using Googles model: ', average_meteor_score)
"""

"\naverage_bleu_score = np.mean(google_bleu_scores)\naverage_gleu_score = np.mean(google_gleu_scores)\naverage_meteor_score = np.mean(google_meteor_scores)\nprint('Average BLEU Score for Test Set using Googles model: ', average_bleu_score)\nprint('Average GLEU Score for Test Set using Googles model: ', average_gleu_score)\nprint('Average METEOR Score for Test Set using Googles model: ', average_meteor_score)\n"

In [19]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor

# Initialize metrics
indict2_bleu_scores = []
indict2_gleu_scores = []
indict2_meteor_scores = []

# Model and tokenizer setup
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "tam_Taml", "eng_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Convert numpy array to a list
native_tamil_sentences = native_tamil_sentences_dev_set.tolist()
english_sentences = english_sentences_dev_set.tolist()

# Ensure the datasets are aligned
assert len(native_tamil_sentences) == len(english_sentences), "Dev sets must be of equal length."

for i, tamil_text in enumerate(native_tamil_sentences):
    try:
        # print(f"Processing sentence {i+1}...")
        
        # Preprocessing
        batch = ip.preprocess_batch(
            [tamil_text],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        # print(f"Preprocessed batch: {batch}")
        
        # Tokenize the sentences and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=1,
                num_return_sequences=1,
            )
        # print(f"Generated tokens: {generated_tokens}")

        # Decode the generated tokens into text
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translated_text = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
        # print(f"Translated text: {translated_text}")

        reference = [english_sentences_dev_set[i].split()]
        transformed_tokens = translated_text.split()

        bleu_score = sentence_bleu(reference, transformed_tokens)
        gleu_score = sentence_gleu(reference, transformed_tokens)
        met_score = meteor_score(reference, transformed_tokens)

        indict2_bleu_scores.append(bleu_score)
        indict2_gleu_scores.append(gleu_score)
        indict2_meteor_scores.append(met_score)

        # Log progress every 10 sentences
        if i % 10 == 0:
            print("Translating sentence", i + 1)
            print("Original Tamil text:", tamil_text)
            print("Original English Conversion:", english_sentences[i])
            print("Translated to English:", translated_text)
            print("Average BLEU score:", np.mean(indict2_bleu_scores))
            print("Average GLEU score:", np.mean(indict2_gleu_scores))
            print("Average METEOR score:", np.mean(indict2_meteor_scores))
            print()

    except Exception as e:
        print(f"Error processing sentence {i+1}: {e}")
        continue




Translating sentence 1
Original Tamil text: மாண்புமிகு வணிகவரி மற்றும் பதிவுத் துறை அமைச்சர் அவர்களால் 21.09.2015 அன்று சட்டமன்றப் பேரவையில் வெளியிடப்பட்ட அறிவிப்பிற்கிணங்க, ரூ.5,00,43,500/மதிப்பீட்டில் வணிகவரி அலுவலகக் கட்டடங்களில் உட்கட்டமைப்பு வசதிகளை மேம்படுத்தவும்மேம்படுத்தவும்,, முழுமையான கணினி மயமாக்கல் திட்டத்தின் கீழ் அமைக்கப்பட்ட விலை உயர்ந்த தகவல் தொழில்நுட்ப சாதனங்களைப் பாதுகாக்கும் வகையில் வணிகவரி அலுவலகக் கட்டடங்களை பொதுப்பணித்துறை மூலம் பழுது பார்ப்பதற்கும், பராமரிப்பதற்கும், அலுவலகங்களுக்கு தேவையான தளவாடங்களை டான்சி மூலம் கொள்முதல் செய்வதற்கும் நிதி ஒடுக்கீடு செய்யப்பட்டுள்ளது.

Original English Conversion: As announced by the Hon'ble Minister for Commercial Taxes and Registration during the demand for grants for the year 2015-16 in the Assembly on 21.09.2015, the Government have accorded sanction for a sum of Rs.10.00 crore for construction of an Integrated Office Complex at Survey No.375No.375//11,, Varadharajapuram village, Nazarathpettai in Poonamallee Taluk of Tiru

In [20]:
average_bleu_score = np.mean(indict2_bleu_scores)
average_gleu_score = np.mean(indict2_gleu_scores)
average_meteor_score = np.mean(indict2_meteor_scores)
print('Average BLEU Score for Test Set using IndicTrans2 model: ', average_bleu_score)
print('Average GLEU Score for Test Set using IndicTrans2 model: ', average_gleu_score)
print('Average METEOR Score for Test Set using IndicTrans2 model: ', average_meteor_score)

Average BLEU Score for Test Set using IndicTrans2 model:  0.09696862190068785
Average GLEU Score for Test Set using IndicTrans2 model:  0.23192238550458905
Average METEOR Score for Test Set using IndicTrans2 model:  0.41127465985968154
