In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.meteor_score import meteor_score

In [2]:
import re
from sklearn.model_selection import train_test_split

In [3]:
english_scripts_file_path = "../datasets/Samantar dataset/final_data/en-hi/train.en"
native_scripts_file_path = "../datasets/Samantar dataset/final_data/en-hi/train.hi"

In [4]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)

english_lines = []
with open(english_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        english_lines.append(line)

In [5]:
df = pd.DataFrame({'Hindi': native_lines, 'English': english_lines})
print(df.head())

                                               Hindi  \
0        जिसके जवाब में पाक ने अच्छी शुरुआत की थी.\n   
1  यूरोपीय संघ के महत्वपूर्ण संस्थानों में यूरोपि...   
2  कांग्रेस नेता तमिलनाडु से शिवगंगा लोकसभा क्षेत...   
3  संबंधन प्रयास के बारे में उपयोक्ता को प्रांप्ट...   
4  वित्त मंत्री ने घोषणा कि जमा बीमा और ऋण गारंटी...   

                                             English  
0     In reply, Pakistan got off to a solid start.\n  
1  The European Union has seven principal decisio...  
2  The Congress leader represents Sivaganga Lok S...  
3        Prompt the user about connection attempts\n  
4  Further, the Minister announced that Deposit I...  


In [6]:
test_set = df.sample(n = 1000, random_state = 1234)
print(f"Test set size: {len(test_set)}")
print(f"Dev set size: {len(test_set)}")

Test set size: 1000
Dev set size: 1000


In [7]:
native_hindi_sentences_test_set = test_set['Hindi'].values
print(type(native_hindi_sentences_test_set))
print(native_hindi_sentences_test_set.shape)
english_sentences_test_set = test_set['English'].values

<class 'numpy.ndarray'>
(1000,)


In [8]:
import torch
import numpy as np
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from IndicTransToolkit import IndicProcessor

# Initialize metrics
indict2_bleu_scores = []
indict2_gleu_scores = []
indict2_meteor_scores = []

# Model and tokenizer setup
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True).to("cuda" if torch.cuda.is_available() else "cpu")
ip = IndicProcessor(inference=True)

src_lang, tgt_lang = "hin_Deva", "eng_Latn"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Convert numpy array to a list
native_hindi_sentences = native_hindi_sentences_test_set.tolist()
english_sentences = english_sentences_test_set.tolist()

# Ensure the datasets are aligned
assert len(native_hindi_sentences) == len(english_sentences), "Dev sets must be of equal length."

for i, hindi_text in enumerate(native_hindi_sentences):
    try:
        # print(f"Processing sentence {i+1}...")
        
        # Preprocessing
        batch = ip.preprocess_batch(
            [hindi_text],
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        # print(f"Preprocessed batch: {batch}")
        
        # Tokenize the sentences and generate input encodings
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Generate translations using the model
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=1,
                num_return_sequences=1,
            )
        # print(f"Generated tokens: {generated_tokens}")

        # Decode the generated tokens into text
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translated_text = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0]
        # print(f"Translated text: {translated_text}")

        reference = [english_sentences[i].split()]
        transformed_tokens = translated_text.split()

        bleu_score = sentence_bleu(reference, transformed_tokens)
        gleu_score = sentence_gleu(reference, transformed_tokens)
        met_score = meteor_score(reference, transformed_tokens)

        indict2_bleu_scores.append(bleu_score)
        indict2_gleu_scores.append(gleu_score)
        indict2_meteor_scores.append(met_score)

        # Log progress every 10 sentences
        if i % 10 == 0:
            print("Translating sentence", i + 1)
            print("Original Hindi text:", hindi_text)
            print("Original English Conversion:", english_sentences[i])
            print("Translated to English:", translated_text)
            print("Average BLEU score:", np.mean(indict2_bleu_scores))
            print("Average GLEU score:", np.mean(indict2_gleu_scores))
            print("Average METEOR score:", np.mean(indict2_meteor_scores))
            print()

    except Exception as e:
        print(f"Error processing sentence {i+1}: {e}")
        continue


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence 1
Original Hindi text: उसे इलाज के लिए फौरन अस्पताल में एडमिट कराया गया है.

Original English Conversion: He was immediately admitted to a hospital for treatment.

Translated to English: He was immediately taken to hospital for treatment.
Average BLEU score: 6.886705081558736e-78
Average GLEU score: 0.43333333333333335
Average METEOR score: 0.8918539325842695



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Translating sentence 11
Original Hindi text: उनके दूसरे तथा तीसरे कविता संग्रह बर्ड आफ टाइम तथा ब्रोकन विंग ने उन्हें एक सुप्रसिद्ध कवयित्री बना दिया।

Original English Conversion: Her second and third collection of poems, 'Bird of Time and' Broken Wing 'made a very famous poetess.

Translated to English: Her second and third collections of poems, Bird of Time and Broken Wing, made her a well-known poetess.
Average BLEU score: 0.23846831354705333
Average GLEU score: 0.3716099075929943
Average METEOR score: 0.5255110906977684

Translating sentence 21
Original Hindi text: उसे आपकी बहुत जरूरत है।

Original English Conversion: He needs you.

Translated to English: He needs you so much.
Average BLEU score: 0.20806433796393395
Average GLEU score: 0.3487143096106563
Average METEOR score: 0.5648239833583856

Translating sentence 31
Original Hindi text: यह एक काम है जो मुझे करना ही है, सो मैं करती हूँ ।

Original English Conversion: Its something I must do, so I do it.

Translated to English: I

In [9]:
average_bleu_score = np.mean(indict2_bleu_scores)
average_gleu_score = np.mean(indict2_gleu_scores)
average_meteor_score = np.mean(indict2_meteor_scores)
print('Average BLEU Score for Dev Set using IndicTrans2 model: ', average_bleu_score)
print('Average GLEU Score for Dev Set using IndicTrans2 model: ', average_gleu_score)
print('Average METEOR Score for Dev Set using IndicTrans2 model: ', average_meteor_score)

Average BLEU Score for Dev Set using IndicTrans2 model:  0.20708175845134716
Average GLEU Score for Dev Set using IndicTrans2 model:  0.31121107065543363
Average METEOR Score for Dev Set using IndicTrans2 model:  0.5322711268426247
