In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 

In [2]:
import re
from sklearn.model_selection import train_test_split

In [3]:
english_scripts_file_path = "../datasets/Samantar dataset/final_data/en-hi/train.en"
native_scripts_file_path = "../datasets/Samantar dataset/final_data/en-hi/train.hi"

In [4]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)

english_lines = []
with open(english_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        english_lines.append(line)

In [5]:
native_lines[1]

'यूरोपीय संघ के महत्वपूर्ण संस्थानों में यूरोपियन कमीशन, यूरोपीय संसद, यूरोपीय संघ परिषद, यूरोपीय न्यायलय एवं यूरोपियन सेंट्रल बैंक इत्यादि शामिल हैं।\n'

In [6]:
english_lines[1]

'The European Union has seven principal decision-making bodies, its institutions: the European Parliament, the European Council, the Council of the European Union, the European Commission, the Court of Justice of the European Union, the European Central Bank and the European Court of Auditors.\n'

In [7]:
len(native_lines)

8568307

In [8]:
len(english_lines)

8568307

In [9]:
df = pd.DataFrame({'Hindi': native_lines, 'English': english_lines})
print(df.head())

                                               Hindi  \
0        जिसके जवाब में पाक ने अच्छी शुरुआत की थी.\n   
1  यूरोपीय संघ के महत्वपूर्ण संस्थानों में यूरोपि...   
2  कांग्रेस नेता तमिलनाडु से शिवगंगा लोकसभा क्षेत...   
3  संबंधन प्रयास के बारे में उपयोक्ता को प्रांप्ट...   
4  वित्त मंत्री ने घोषणा कि जमा बीमा और ऋण गारंटी...   

                                             English  
0     In reply, Pakistan got off to a solid start.\n  
1  The European Union has seven principal decisio...  
2  The Congress leader represents Sivaganga Lok S...  
3        Prompt the user about connection attempts\n  
4  Further, the Minister announced that Deposit I...  


In [10]:
len(df)

8568307

In [11]:
dev_set = df.sample(n = 500, replace = False, random_state = np.random.randint(low=1, high=1234))
remaining_data = df.drop(dev_set.index)

test_set = remaining_data.sample(n = 1000, random_state = np.random.randint(low=1, high=1234))
print(f"Test set size: {len(test_set)}")
print(f"Dev set size: {len(dev_set)}")

Test set size: 1000
Dev set size: 500


In [12]:
native_hindi_sentences_dev_set = dev_set['Hindi'].values
print(type(native_hindi_sentences_dev_set))
print(native_hindi_sentences_dev_set.shape)
english_sentences_dev_set = dev_set['English'].values

<class 'numpy.ndarray'>
(500,)


In [13]:
print(len(english_sentences_dev_set))

500


In [14]:
# Native hindi to english translations using the Facebook model
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

facebook_bleu_scores = []
facebook_gleu_scores = []
facebook_meteor_scores = []

# translate Hindi to English
tokenizer.src_lang = "hi"
for i, hindi_text in enumerate(native_hindi_sentences_dev_set):
    # print(i)
    encoded_hi = tokenizer(hindi_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("en"))
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # print(translated_text)
    # print(translated_text[0].split())
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translated_text[0].split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    facebook_bleu_scores.append(bleu_score)
    facebook_gleu_scores.append(gleu_score)
    facebook_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Hindi text :', hindi_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translated_text[0])
        print('Average BLEU score :', np.mean(facebook_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(facebook_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(facebook_meteor_scores), '\n')
        print()

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence  1
Original Hindi text : उसकी पत्नी ने एक लड़के को जन्म दिया है.

Original English Conversion : His wife delivers a baby boy.

Translated to English : His wife has given birth to a boy.
Average BLEU score : 7.711523862191631e-155 

Average GLEU score : 0.19230769230769232 

Average METEOR score : 0.5090725806451614 




The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence  11
Original Hindi text : पीएम मोदी ने दी नयी परिभाषा

Original English Conversion : PM Modi kicks off new term

Translated to English : PM Modi has given a new definition
Average BLEU score : 0.0358366358434621 

Average GLEU score : 0.14102967065293978 

Average METEOR score : 0.35461355284418833 


Translating sentence  21
Original Hindi text : मुख्य प्रथा जुआन रोड्रिग्ज़ फ्रेल के द्वारा लिखी गई पुस्तक एल कार्नेरो की कथाओं में पाई जाती है।

Original English Conversion : The original narrative can be found in the rambling chronicle El Carnero of Juan Rodriguez Freyle.

Translated to English : The main practice is found in El Carnero's stories in the book written by Juan Rodriguez Frell.
Average BLEU score : 0.07382846224997847 

Average GLEU score : 0.1703872009804717 

Average METEOR score : 0.37899086065223936 


Translating sentence  31
Original Hindi text : आम तौर से दो दवाएं उपयोग की जाती हैं जिनमें से एक अक्सर प्लेटिनम आधारित (सिस्प्लाटिन या कार्बोप्लाटिन) 

In [29]:
average_bleu_score = np.mean(facebook_bleu_scores)
average_gleu_score = np.mean(facebook_gleu_scores)
average_meteor_score = np.mean(facebook_meteor_scores)
print('Average BLEU Score for Dev Set using Facebooks model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using Facebooks model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using Facebooks model: ', average_meteor_score)

Average BLEU Score for Dev Set using Facebooks model:  0.08167956371696915
Average GLEU Score for Dev Set using Facebooks model:  0.19350009450494746
Average METEOR Score for Dev Set using Facebooks model:  0.39126703007862956


In [17]:
# Native hindi to english translations using the Facebook's MBART model
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

mbart_bleu_scores = []
mbart_gleu_scores = []
mbart_meteor_scores = []

# translate Hindi to English
tokenizer.src_lang = "hi_IN"
for i, hindi_text in enumerate(native_hindi_sentences_dev_set):
    # print(i)
    encoded_hi = tokenizer(hindi_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id['en_XX'])
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # print(translated_text)
    # print(translated_text[0].split())
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translated_text[0].split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    mbart_bleu_scores.append(bleu_score)
    mbart_gleu_scores.append(gleu_score)
    mbart_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Hindi text :', hindi_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translated_text[0])
        print('Average BLEU score :', np.mean(mbart_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(mbart_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(mbart_meteor_scores), '\n')
        print()

Translating sentence  1
Original Hindi text : उसकी पत्नी ने एक लड़के को जन्म दिया है.

Original English Conversion : His wife delivers a baby boy.

Translated to English : His wife has given birth to a boy.
Average BLEU score : 7.711523862191631e-155 

Average GLEU score : 0.19230769230769232 

Average METEOR score : 0.5090725806451614 


Translating sentence  11
Original Hindi text : पीएम मोदी ने दी नयी परिभाषा

Original English Conversion : PM Modi kicks off new term

Translated to English : PM Modi gave a new definition
Average BLEU score : 0.07353618779524558 

Average GLEU score : 0.17826428272623704 

Average METEOR score : 0.431765867933693 


Translating sentence  21
Original Hindi text : मुख्य प्रथा जुआन रोड्रिग्ज़ फ्रेल के द्वारा लिखी गई पुस्तक एल कार्नेरो की कथाओं में पाई जाती है।

Original English Conversion : The original narrative can be found in the rambling chronicle El Carnero of Juan Rodriguez Freyle.

Translated to English : The main tradition is found in the stories

In [28]:
average_bleu_score = np.mean(mbart_bleu_scores)
average_gleu_score = np.mean(mbart_gleu_scores)
average_meteor_score = np.mean(mbart_meteor_scores)
print('Average BLEU Score for Dev Set using Facebooks mBART model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using Facebooks mBART model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using Facebooks mBART model: ', average_meteor_score)

Average BLEU Score for Dev Set using Facebooks mBART model:  0.11132998186862958
Average GLEU Score for Dev Set using Facebooks mBART model:  0.23085949016774981
Average METEOR Score for Dev Set using Facebooks mBART model:  0.4525925927984988


In [19]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor


model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

ip = IndicProcessor(inference=True)

input_sentences = [
    "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
    "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
    "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
    "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
]

src_lang, tgt_lang = "hin_Deva", "eng_Latn"

batch = ip.preprocess_batch(
    input_sentences,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenize the sentences and generate input encodings
inputs = tokenizer(
    batch,
    truncation=True,
    padding="longest",
    return_tensors="pt",
    return_attention_mask=True,
).to(DEVICE)

# Generate translations using the model
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs,
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

# Decode the generated tokens into text
with tokenizer.as_target_tokenizer():
    generated_tokens = tokenizer.batch_decode(
        generated_tokens.detach().cpu().tolist(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Postprocess the translations, including entity replacement
translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)

for input_sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")


hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
eng_Latn: When I was young, I used to go to the park every day.
hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
eng_Latn: We saw a new movie last week that was very inspiring.
hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
eng_Latn: If you'd given me a pass at that time, we'd have gone out to eat.
hin_Deva: मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।
eng_Latn: My friend has invited me to her birthday party, and I'll give her a present.




In [20]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor

# Initialize metrics
indict2_bleu_scores = []
indict2_gleu_scores = []
indict2_meteor_scores = []

# Model and tokenizer setup
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "hin_Deva", "eng_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Convert numpy array to a list
native_hindi_sentences = native_hindi_sentences_dev_set.tolist()
english_sentences = english_sentences_dev_set.tolist()

# Ensure the datasets are aligned
assert len(native_hindi_sentences) == len(english_sentences), "Dev sets must be of equal length."

for i, hindi_text in enumerate(native_hindi_sentences):
    try:
        # print(f"Processing sentence {i+1}...")
        
        # Preprocessing
        batch = ip.preprocess_batch(
            [hindi_text],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        # print(f"Preprocessed batch: {batch}")
        
        # Tokenize the sentences and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=1,
                num_return_sequences=1,
            )
        # print(f"Generated tokens: {generated_tokens}")

        # Decode the generated tokens into text
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translated_text = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
        # print(f"Translated text: {translated_text}")

        reference = [english_sentences_dev_set[i].split()]
        transformed_tokens = translated_text.split()

        bleu_score = sentence_bleu(reference, transformed_tokens)
        gleu_score = sentence_gleu(reference, transformed_tokens)
        met_score = meteor_score(reference, transformed_tokens)

        indict2_bleu_scores.append(bleu_score)
        indict2_gleu_scores.append(gleu_score)
        indict2_meteor_scores.append(met_score)

        # Log progress every 10 sentences
        if i % 10 == 0:
            print("Translating sentence", i + 1)
            print("Original Hindi text:", hindi_text)
            print("Original English Conversion:", english_sentences[i])
            print("Translated to English:", translated_text)
            print("Average BLEU score:", np.mean(indict2_bleu_scores))
            print("Average GLEU score:", np.mean(indict2_gleu_scores))
            print("Average METEOR score:", np.mean(indict2_meteor_scores))
            print()

    except Exception as e:
        print(f"Error processing sentence {i+1}: {e}")
        continue


Translating sentence 1
Original Hindi text: उसकी पत्नी ने एक लड़के को जन्म दिया है.

Original English Conversion: His wife delivers a baby boy.

Translated to English: His wife gave birth to a baby boy.
Average BLEU score: 5.614021910443866e-78
Average GLEU score: 0.34615384615384615
Average METEOR score: 0.7806451612903227

Translating sentence 11
Original Hindi text: पीएम मोदी ने दी नयी परिभाषा

Original English Conversion: PM Modi kicks off new term

Translated to English: PM Modi has given a new definition to the word.
Average BLEU score: 0.14237767459099188
Average GLEU score: 0.25227348556664414
Average METEOR score: 0.5070000468927506

Translating sentence 21
Original Hindi text: मुख्य प्रथा जुआन रोड्रिग्ज़ फ्रेल के द्वारा लिखी गई पुस्तक एल कार्नेरो की कथाओं में पाई जाती है।

Original English Conversion: The original narrative can be found in the rambling chronicle El Carnero of Juan Rodriguez Freyle.

Translated to English: The main practice is found in the book El Carnero's Fa

In [27]:
average_bleu_score = np.mean(indict2_bleu_scores)
average_gleu_score = np.mean(indict2_gleu_scores)
average_meteor_score = np.mean(indict2_meteor_scores)
print('Average BLEU Score for Dev Set using IndicTrans2 model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using IndicTrans2 model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using IndicTrans2 model: ', average_meteor_score)

Average BLEU Score for Dev Set using IndicTrans2 model:  0.1897920283826145
Average GLEU Score for Dev Set using IndicTrans2 model:  0.3062570258729637
Average METEOR Score for Dev Set using IndicTrans2 model:  0.5301611172660362


In [32]:
# Opus MarianMT
from transformers import pipeline
OpusMT_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

opusMT_bleu_scores = []
opusMT_gleu_scores = []
opusMT_meteor_scores = []

for i, hindi_text in enumerate(native_hindi_sentences_dev_set):

    translation = OpusMT_translator(hindi_text)[0]["translation_text"]
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translation.split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    opusMT_bleu_scores.append(bleu_score)
    opusMT_gleu_scores.append(gleu_score)
    opusMT_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Hindi text :', hindi_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translation)
        print('Average BLEU score :', np.mean(opusMT_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(opusMT_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(opusMT_meteor_scores), '\n')
        print()

Translating sentence  1
Original Hindi text : उसकी पत्नी ने एक लड़के को जन्म दिया है.

Original English Conversion : His wife delivers a baby boy.

Translated to English : His wife has given birth to a boy.
Average BLEU score : 7.711523862191631e-155 

Average GLEU score : 0.19230769230769232 

Average METEOR score : 0.5090725806451614 


Translating sentence  11
Original Hindi text : पीएम मोदी ने दी नयी परिभाषा

Original English Conversion : PM Modi kicks off new term

Translated to English : Cannot initialise Evolution's mail component.
Average BLEU score : 0.07887340478026815 

Average GLEU score : 0.16388593427882278 

Average METEOR score : 0.36255486306072393 


Translating sentence  21
Original Hindi text : मुख्य प्रथा जुआन रोड्रिग्ज़ फ्रेल के द्वारा लिखी गई पुस्तक एल कार्नेरो की कथाओं में पाई जाती है।

Original English Conversion : The original narrative can be found in the rambling chronicle El Carnero of Juan Rodriguez Freyle.

Translated to English : The book El Carnero, by 

In [33]:
average_bleu_score = np.mean(opusMT_bleu_scores)
average_gleu_score = np.mean(opusMT_gleu_scores)
average_meteor_score = np.mean(opusMT_meteor_scores)
print('Average BLEU Score for Dev Set using OpusMT mul-en model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using OpusMT mul-en model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using OpusMT mul-en model: ', average_meteor_score)

Average BLEU Score for Dev Set using OpusMT mul-en model:  0.06472286856525937
Average GLEU Score for Dev Set using OpusMT mul-en model:  0.16009251164121446
Average METEOR Score for Dev Set using OpusMT mul-en model:  0.3138617101674688
