In [2]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.10.1 sacrebleu-2.4.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import pandas as pd
import sacrebleu
from transformers import MarianMTModel, MarianTokenizer, MBartForConditionalGeneration, MBart50TokenizerFast
from tqdm import tqdm    

In [4]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [5]:
# Load your dataset
dataset_path = '/kaggle/input/hindienglishdata/Hindi_English_Truncated_Corpus.csv'
data = pd.read_csv(dataset_path)

In [6]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [7]:
data.isnull().sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [8]:
# Drop rows where the English sentence is missing or not a valid string
data = data.dropna(subset=['english_sentence', 'hindi_sentence'])
data = data[data['english_sentence'].apply(lambda x: isinstance(x, str) and x.strip() != '')]


In [8]:
# Load MarianMT model and tokenizer for English to Hindi translation
marian_model_name = "Helsinki-NLP/opus-mt-en-hi"
marian_tokenizer = MarianTokenizer.from_pretrained(marian_model_name)
marian_model = MarianMTModel.from_pretrained(marian_model_name).to(device)


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [11]:
def translate_marian_batch(texts, max_length=512):
    inputs = marian_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    with torch.no_grad():
        translated = marian_model.generate(**inputs)
    return marian_tokenizer.batch_decode(translated, skip_special_tokens=True)

In [13]:
translations_marian = []
reference_hindi_sentences = []

In [14]:
batch_size = 32

for i in tqdm(range(0, len(data), batch_size), desc="Translating"):
    batch_data = data.iloc[i:i+batch_size]
    
    # Get the English sentences and reference Hindi sentences
    english_sentences = batch_data['english_sentence'].tolist()
    reference_hindi_batch = batch_data['hindi_sentence'].tolist()
    
    # Translate the batch using MarianMT model
    translation_marian_batch = translate_marian_batch(english_sentences)
    
    # Append translations and reference sentences
    translations_marian.extend(translation_marian_batch)
    reference_hindi_sentences.extend(reference_hindi_batch)

Translating: 100%|██████████| 3988/3988 [2:30:47<00:00,  2.27s/it]  


In [15]:
bleu_marian = sacrebleu.corpus_bleu(translations_marian, [reference_hindi_sentences])

In [16]:
print(f"BLEU Score for MarianMT Model ({marian_model_name}): {bleu_marian.score}")

BLEU Score for MarianMT Model (Helsinki-NLP/opus-mt-en-hi): 7.127563539913191


In [18]:
print("\nDisplaying 10 sample translations:")
for idx in range(10):
    print(f"Sample {idx + 1}:")
    print(f"English Sentence: {data.iloc[idx]['english_sentence']}")
    print(f"Actual Hindi Sentence: {reference_hindi_sentences[idx]}")
    print(f"Predicted Hindi Sentence: {translations_marian[idx]}")
    print('-' * 80)


Displaying 10 sample translations:
Sample 1:
English Sentence: politicians do not have permission to do what needs to be done.
Actual Hindi Sentence: राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .
Predicted Hindi Sentence: नेताओं को क्या करने की अनुमति नहीं है ।
--------------------------------------------------------------------------------
Sample 2:
English Sentence: I'd like to tell you about one such child,
Actual Hindi Sentence: मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,
Predicted Hindi Sentence: मैं तुम्हें एक ऐसे बच्चे के बारे में बताना चाहते हैं,
--------------------------------------------------------------------------------
Sample 3:
English Sentence: This percentage is even greater than the percentage in India.
Actual Hindi Sentence: यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
Predicted Hindi Sentence: यह प्रतिशत भारत में प्रतिशत से भी बड़ा है.
--------------------------------------------------------------------------------
Sample 4:
Engli

In [9]:
mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"  
model = MBartForConditionalGeneration.from_pretrained(mbart_model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



In [10]:
tokenizer.src_lang = "en_XX" 
target_lang = "hi_IN" 

In [11]:
def translate_batch(batch_sentences, target_language="hi_IN", max_length=128):
    input_texts = [f"translate English to Hindi: " + text for text in batch_sentences]
    
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
    
    model.config.forced_bos_token_id = tokenizer.lang_code_to_id[target_language]
    
    with torch.no_grad():
        with torch.cuda.amp.autocast():
            translated = model.generate(**inputs)
    
    return tokenizer.batch_decode(translated, skip_special_tokens=True)

In [12]:
model = model.to(device)

In [16]:
batch_size = 32 
translations = []
reference_hindi_sentences = []

for i in tqdm(range(0, len(data), batch_size)):
    batch = data['english_sentence'][i:i+batch_size].tolist()
    translations.extend(translate_batch(batch))
    reference_hindi_sentences.extend(data['hindi_sentence'][i:i+batch_size].tolist())

print(f"Sample 1:")
print(f"English Sentence: {data.iloc[0]['english_sentence']}")
print(f"Actual Hindi Sentence: {data.iloc[0]['hindi_sentence']}")
print(f"Predicted by mBART: {translations[0]}")
print('-' * 80)

print(f"Sample {len(data)}:")
print(f"English Sentence: {data.iloc[-1]['english_sentence']}")
print(f"Actual Hindi Sentence: {data.iloc[-1]['hindi_sentence']}")
print(f"Predicted by mBART: {translations[-1]}")
print('-' * 80)

  with torch.cuda.amp.autocast():
100%|██████████| 3988/3988 [5:54:21<00:00,  5.33s/it]  

Sample 1:
English Sentence: politicians do not have permission to do what needs to be done.
Actual Hindi Sentence: राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .
Predicted by mBART: अंग्रेजी से हिन्दी में अनुवादः राजनीतिज्ञों को जो कुछ करना है, वह करने की अनुमति नहीं है।
--------------------------------------------------------------------------------
Sample 127605:
English Sentence: They've just won four government contracts to build off their 100 ambulances,
Actual Hindi Sentence: हाल ही में उन्हें सरकारी ठेका मिला है करीब सौ नई अम्बुलेन्स बनाने का,
Predicted by mBART: अंग्रेज़ी से हिन्दी में अनुवाद करेंः उन्होंने अपने 100 एम्बुलेंस बनाने के लिए अभी चार सरकारी संविदाएं जीती हैं,
--------------------------------------------------------------------------------





In [18]:
bleu_t5 = sacrebleu.corpus_bleu(translations, [reference_hindi_sentences])
print(f"BLEU Score for mBART Model: {bleu_t5.score}")

BLEU Score for mBART Model: 11.51561735505984


In [23]:
print("\nDisplaying 10 sample translations:")
for idx in range(10):
    print(f"Sample {idx + 1}:")
    print(f"English Sentence: {data.iloc[idx]['english_sentence']}")
    print(f"Actual Hindi Sentence: {reference_hindi_sentences[idx]}")
    print(f"Predicted Hindi Sentence: {translations[idx]}")
    print('-' * 80)


Displaying 10 sample translations:
Sample 1:
English Sentence: politicians do not have permission to do what needs to be done.
Actual Hindi Sentence: राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .
Predicted Hindi Sentence: अंग्रेजी से हिन्दी में अनुवादः राजनीतिज्ञों को जो कुछ करना है, वह करने की अनुमति नहीं है।
--------------------------------------------------------------------------------
Sample 2:
English Sentence: I'd like to tell you about one such child,
Actual Hindi Sentence: मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,
Predicted Hindi Sentence: मैं आप को एक ऐसे बच्चे के बारे में बताना चाहूंगा,
--------------------------------------------------------------------------------
Sample 3:
English Sentence: This percentage is even greater than the percentage in India.
Actual Hindi Sentence: यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
Predicted Hindi Sentence: अंग्रेजी से हिन्दी में अनुवादः यह प्रतिशत भारत के प्रतिशत से भी अधिक है।
--------------------