In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer 

In [2]:
import re
from sklearn.model_selection import train_test_split

In [3]:
english_scripts_file_path = "../datasets/Samantar dataset/final_data/en-hi/train.en"
native_scripts_file_path = "../datasets/Samantar dataset/final_data/en-hi/train.hi"

In [4]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)

english_lines = []
with open(english_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        english_lines.append(line)

In [5]:
native_lines[1]

'यूरोपीय संघ के महत्वपूर्ण संस्थानों में यूरोपियन कमीशन, यूरोपीय संसद, यूरोपीय संघ परिषद, यूरोपीय न्यायलय एवं यूरोपियन सेंट्रल बैंक इत्यादि शामिल हैं।\n'

In [6]:
english_lines[1]

'The European Union has seven principal decision-making bodies, its institutions: the European Parliament, the European Council, the Council of the European Union, the European Commission, the Court of Justice of the European Union, the European Central Bank and the European Court of Auditors.\n'

In [7]:
len(native_lines)

8568307

In [8]:
len(english_lines)

8568307

In [9]:
df = pd.DataFrame({'Hindi': native_lines, 'English': english_lines})
print(df.head())

                                               Hindi  \
0        जिसके जवाब में पाक ने अच्छी शुरुआत की थी.\n   
1  यूरोपीय संघ के महत्वपूर्ण संस्थानों में यूरोपि...   
2  कांग्रेस नेता तमिलनाडु से शिवगंगा लोकसभा क्षेत...   
3  संबंधन प्रयास के बारे में उपयोक्ता को प्रांप्ट...   
4  वित्त मंत्री ने घोषणा कि जमा बीमा और ऋण गारंटी...   

                                             English  
0     In reply, Pakistan got off to a solid start.\n  
1  The European Union has seven principal decisio...  
2  The Congress leader represents Sivaganga Lok S...  
3        Prompt the user about connection attempts\n  
4  Further, the Minister announced that Deposit I...  


In [10]:
len(df)

8568307

In [11]:
dev_set = df.sample(n = 500, replace = False, random_state = np.random.randint(low=1, high=1234))
remaining_data = df.drop(dev_set.index)

test_set = remaining_data.sample(n = 1000, random_state = np.random.randint(low=1, high=1234))
print(f"Test set size: {len(test_set)}")
print(f"Dev set size: {len(dev_set)}")

Test set size: 1000
Dev set size: 500


In [12]:
native_hindi_sentences_dev_set = dev_set['Hindi'].values
print(type(native_hindi_sentences_dev_set))
print(native_hindi_sentences_dev_set.shape)
english_sentences_dev_set = dev_set['English'].values

<class 'numpy.ndarray'>
(500,)


In [13]:
print(len(english_sentences_dev_set))

500


In [14]:
# Native hindi to english translations using the Facebook model
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

facebook_bleu_scores = []
facebook_gleu_scores = []
facebook_meteor_scores = []

# translate Hindi to English
tokenizer.src_lang = "hi"
for i, hindi_text in enumerate(native_hindi_sentences_dev_set):
    # print(i)
    encoded_hi = tokenizer(hindi_text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("en"))
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    # print(translated_text)
    # print(translated_text[0].split())
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translated_text[0].split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    facebook_bleu_scores.append(bleu_score)
    facebook_gleu_scores.append(gleu_score)
    facebook_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Hindi text :', hindi_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translated_text[0])
        print('Average BLEU score :', np.mean(facebook_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(facebook_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(facebook_meteor_scores), '\n')
        print()

Translating sentence  1
Original Hindi text : ऑस्ट्रिया से एक कम्प्यूटर प्रचालक, रोलान्ट और उसकी पत्नी यूटा, देश के गर्म, खुश्‍क दक्षिणी भाग में बस गए हैं ।

Original English Conversion : Roland, a computer operator from Austria, and his wife, Yuta, have settled in the warm, dry, southern part of the country.

Translated to English : A computer operator from Austria, Roland and his wife, Utah, have settled in the hot, southern part of the country.
Average BLEU score : 0.45605983089973623 

Average GLEU score : 0.5128205128205128 

Average METEOR score : 0.8080992036555685 




The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence  11
Original Hindi text : हालाँकि, उसने कहा कि उसे उत्तर प्रदेश पुलिस पर भरोसा नहीं है और उसने जोर देकर कहा कि वह केवल मध्य प्रदेश पुलिस के सामने आत्मसमर्पण करेगी।

Original English Conversion : However, she said that she did not trust the Uttar Pradesh Police and insisted that she would only surrender to the Madhya Pradesh Police.

Translated to English : However, he said he has no confidence in the Northern Pradesh police and he emphasized that he will only surrender to the Central Pradesh police.
Average BLEU score : 0.11104515649191442 

Average GLEU score : 0.19245703464736758 

Average METEOR score : 0.38417606895226014 


Translating sentence  21
Original Hindi text : राष्ट्रपति के दौरे को देखते हुए भारी सुक्षा व्यवस्था के इंतजाम किए गए हैं।

Original English Conversion : Elaborate security arrangements were made in view of the Presidents visit.

Translated to English : Due to the President's visit, a heavy supervision system has been anticipated.
Average BL

In [15]:
average_bleu_score = np.mean(facebook_bleu_scores)
average_gleu_score = np.mean(facebook_gleu_scores)
average_meteor_score = np.mean(facebook_meteor_scores)
print('Average BLEU Score for Test Set using Facebooks model: ', average_bleu_score)
print('Average GLEU Score for Test Set using Facebooks model: ', average_gleu_score)
print('Average METEOR Score for Test Set using Facebooks model: ', average_meteor_score)

Average BLEU Score for Test Set using Facebooks model:  0.08107246167656472
Average GLEU Score for Test Set using Facebooks model:  0.18128300433968855
Average METEOR Score for Test Set using Facebooks model:  0.3731271913644258


In [16]:
import accelerate
print(accelerate.__version__)

1.1.1


In [19]:
"""
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = 'jbochi/madlad400-3b-mt'
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")
tokenizer = T5Tokenizer.from_pretrained(model_name)

google_bleu_scores = []
google_gleu_scores = []
google_meteor_scores = []

for i, hindi_text in enumerate(native_hindi_sentences_dev_set):
    # print(i)
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids=input_ids)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    reference = [english_sentences_dev_set[i].split()]
    transformed_tokens = translated_text.split()

    bleu_score = sentence_bleu(reference, transformed_tokens)
    gleu_score = sentence_gleu(reference, transformed_tokens)
    met_score = meteor_score(reference, transformed_tokens)

    google_bleu_scores.append(bleu_score)
    google_gleu_scores.append(gleu_score)
    google_meteor_scores.append(met_score)

    if i % 10 == 0:
        print("Translating sentence ", i+1)
        print('Original Hindi text :', hindi_text)
        print('Original English Conversion :', english_sentences_dev_set[i])
        print('Translated to English :', translated_text[0])
        print('Average BLEU score :', np.mean(google_bleu_scores), '\n')
        print('Average GLEU score :', np.mean(google_gleu_scores), '\n')
        print('Average METEOR score :', np.mean(google_meteor_scores), '\n')
        print()
"""

'\nfrom transformers import T5ForConditionalGeneration, T5Tokenizer\n\nmodel_name = \'jbochi/madlad400-3b-mt\'\nmodel = T5ForConditionalGeneration.from_pretrained(model_name, device_map="auto")\ntokenizer = T5Tokenizer.from_pretrained(model_name)\n\ngoogle_bleu_scores = []\ngoogle_gleu_scores = []\ngoogle_meteor_scores = []\n\nfor i, hindi_text in enumerate(native_hindi_sentences_dev_set):\n    # print(i)\n    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)\n    outputs = model.generate(input_ids=input_ids)\n    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n    reference = [english_sentences_dev_set[i].split()]\n    transformed_tokens = translated_text.split()\n\n    bleu_score = sentence_bleu(reference, transformed_tokens)\n    gleu_score = sentence_gleu(reference, transformed_tokens)\n    met_score = meteor_score(reference, transformed_tokens)\n\n    google_bleu_scores.append(bleu_score)\n    google_gleu_scores.append(gleu_score

In [20]:
"""
average_bleu_score = np.mean(google_bleu_scores)
average_gleu_score = np.mean(google_gleu_scores)
average_meteor_score = np.mean(google_meteor_scores)
print('Average BLEU Score for Test Set using Googles model: ', average_bleu_score)
print('Average GLEU Score for Test Set using Googles model: ', average_gleu_score)
print('Average METEOR Score for Test Set using Googles model: ', average_meteor_score)
"""

"\naverage_bleu_score = np.mean(google_bleu_scores)\naverage_gleu_score = np.mean(google_gleu_scores)\naverage_meteor_score = np.mean(google_meteor_scores)\nprint('Average BLEU Score for Test Set using Googles model: ', average_bleu_score)\nprint('Average GLEU Score for Test Set using Googles model: ', average_gleu_score)\nprint('Average METEOR Score for Test Set using Googles model: ', average_meteor_score)\n"

In [21]:
import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor


model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

ip = IndicProcessor(inference=True)

input_sentences = [
    "जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।",
    "हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।",
    "अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।",
    "मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।",
]

src_lang, tgt_lang = "hin_Deva", "eng_Latn"

batch = ip.preprocess_batch(
    input_sentences,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenize the sentences and generate input encodings
inputs = tokenizer(
    batch,
    truncation=True,
    padding="longest",
    return_tensors="pt",
    return_attention_mask=True,
).to(DEVICE)

# Generate translations using the model
with torch.no_grad():
    generated_tokens = model.generate(
        **inputs,
        use_cache=True,
        min_length=0,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,
    )

# Decode the generated tokens into text
with tokenizer.as_target_tokenizer():
    generated_tokens = tokenizer.batch_decode(
        generated_tokens.detach().cpu().tolist(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# Postprocess the translations, including entity replacement
translations = ip.postprocess_batch(generated_tokens, lang=tgt_lang)

for input_sentence, translation in zip(input_sentences, translations):
    print(f"{src_lang}: {input_sentence}")
    print(f"{tgt_lang}: {translation}")


hin_Deva: जब मैं छोटा था, मैं हर रोज़ पार्क जाता था।
eng_Latn: When I was young, I used to go to the park every day.
hin_Deva: हमने पिछले सप्ताह एक नई फिल्म देखी जो कि बहुत प्रेरणादायक थी।
eng_Latn: We saw a new movie last week that was very inspiring.
hin_Deva: अगर तुम मुझे उस समय पास मिलते, तो हम बाहर खाना खाने चलते।
eng_Latn: If you'd given me a pass at that time, we'd have gone out to eat.
hin_Deva: मेरे मित्र ने मुझे उसके जन्मदिन की पार्टी में बुलाया है, और मैं उसे एक तोहफा दूंगा।
eng_Latn: My friend has invited me to her birthday party, and I'll give her a present.




In [30]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor

# Initialize metrics
indict2_bleu_scores = []
indict2_gleu_scores = []
indict2_meteor_scores = []

# Model and tokenizer setup
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "hin_Deva", "eng_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Convert numpy array to a list
native_hindi_sentences = native_hindi_sentences_dev_set.tolist()
english_sentences = english_sentences_dev_set.tolist()

# Ensure the datasets are aligned
assert len(native_hindi_sentences) == len(english_sentences), "Dev sets must be of equal length."

for i, hindi_text in enumerate(native_hindi_sentences):
    try:
        # print(f"Processing sentence {i+1}...")
        
        # Preprocessing
        batch = ip.preprocess_batch(
            [hindi_text],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        # print(f"Preprocessed batch: {batch}")
        
        # Tokenize the sentences and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=1,
                num_return_sequences=1,
            )
        # print(f"Generated tokens: {generated_tokens}")

        # Decode the generated tokens into text
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translated_text = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
        # print(f"Translated text: {translated_text}")

        reference = [english_sentences_dev_set[i].split()]
        transformed_tokens = translated_text.split()

        bleu_score = sentence_bleu(reference, transformed_tokens)
        gleu_score = sentence_gleu(reference, transformed_tokens)
        met_score = meteor_score(reference, transformed_tokens)

        indict2_bleu_scores.append(bleu_score)
        indict2_gleu_scores.append(gleu_score)
        indict2_meteor_scores.append(met_score)

        # Log progress every 10 sentences
        if i % 10 == 0:
            print("Translating sentence", i + 1)
            print("Original Hindi text:", hindi_text)
            print("Original English Conversion:", english_sentences[i])
            print("Translated to English:", translated_text)
            print("Average BLEU score:", np.mean(indict2_bleu_scores))
            print("Average GLEU score:", np.mean(indict2_gleu_scores))
            print("Average METEOR score:", np.mean(indict2_meteor_scores))
            print()

    except Exception as e:
        print(f"Error processing sentence {i+1}: {e}")
        continue


Translating sentence 1
Original Hindi text: इसपर हमारा रूख स्पष्ट और अटल है।

Original English Conversion: Our position on this issue has been clear and consistent.

Translated to English: Our position on this is clear and consistent.
Average BLEU score: 0.38940039153570244
Average GLEU score: 0.47058823529411764
Average METEOR score: 0.7059558517284465

Translating sentence 11
Original Hindi text: अतएव, मन का सर्वप्रथम कार्य तो इन्द्रियानुभव को व्यवस्थित करना है; इसके साथ ही वह देहस्थित चेतना के संकल्प की स्वाभाविक प्रतिक्रियाओं व्यवस्थित करता है और शरीर को एक साधन के रूप में प्रयुक्त करता है, अथवा, जैसा कि साधारणतया कहा जाता है, वह कर्मेन्द्रियों को अपने करण के रूप में प्रयुक्त करता है।

Original English Conversion: The manas is therefore in the first place an organiser of sense experience. in addition it organises the natural reactions of the will in the embodied consciousness and uses the body as an instrument, uses, as it is ordinarily put, the organs of action.

Translated to Eng

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence 21
Original Hindi text: तुम्हारें माल और तुम्हारे प्राण में तुम्हारी परीक्षा होकर रहेगी और तुम्हें उन लोगों से जिन्हें तुमसे पहले किताब प्रदान की गई थी और उन लोगों से जिन्होंने 'शिर्क' किया, बहुत-सी कष्टप्रद बातें सुननी पड़ेगी। परन्तु यदि तुम जमें रहे और (अल्लाह का) डर रखा, तो यह उन कर्मों में से है जो आवश्यक ठहरा दिया गया है

Original English Conversion: You will, nonetheless, be tried with your wealth and life, and will hear many untoward things from the followers of former Books and the infidels. But if you endure with patience and follow the straight path, it will surely (accord) with God's fixed resolve about human affairs.

Translated to English: You will be tested in your wealth and your lives, and you will hear many painful things from those who were given the Book before you and from those who associated others with it. But if you remain steadfast and fear Allah, then this is one of the things prescribed.
Average BLEU score: 0.15443732955839806
Average GLE

In [31]:
average_bleu_score = np.mean(indict2_bleu_scores)
average_gleu_score = np.mean(indict2_gleu_scores)
average_meteor_score = np.mean(indict2_meteor_scores)
print('Average BLEU Score for Test Set using IndicTrans2 model: ', average_bleu_score)
print('Average GLEU Score for Test Set using IndicTrans2 model: ', average_gleu_score)
print('Average METEOR Score for Test Set using IndicTrans2 model: ', average_meteor_score)

Average BLEU Score for Test Set using IndicTrans2 model:  0.18718207973299422
Average GLEU Score for Test Set using IndicTrans2 model:  0.295210247412076
Average METEOR Score for Test Set using IndicTrans2 model:  0.5222354091508167
