In [1]:
!pip install transformers 




In [2]:
!pip install datasets transformers[sentencepiece] sacrebleu




In [3]:
import transformers
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [5]:
model_path = "Helsinki-NLP/opus-mt-en-hi"
dataset_path = "cfilt/iitb-english-hindi"

In [6]:
train = load_dataset(dataset_path, split= "train[:1%]")
val = load_dataset(dataset_path, split = "test")



In [7]:
train


Dataset({
    features: ['translation'],
    num_rows: 16591
})

In [8]:
val

Dataset({
    features: ['translation'],
    num_rows: 2507
})

In [9]:
train['translation'][:5]


[{'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
 {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
 {'en': 'The default plugin layout for the bottom panel',
  'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'The default plugin layout for the top panel',
  'hi': 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'A list of plugins that are disabled by default',
  'hi': 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'}]

In [10]:
train['translation']


[{'en': 'Give your application an accessibility workout',
  'hi': 'अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें'},
 {'en': 'Accerciser Accessibility Explorer',
  'hi': 'एक्सेर्साइसर पहुंचनीयता अन्वेषक'},
 {'en': 'The default plugin layout for the bottom panel',
  'hi': 'निचले पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'The default plugin layout for the top panel',
  'hi': 'ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका'},
 {'en': 'A list of plugins that are disabled by default',
  'hi': 'उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से निष्क्रिय किया गया है'},
 {'en': 'Highlight duration', 'hi': 'अवधि को हाइलाइट रकें'},
 {'en': 'The duration of the highlight box when selecting accessible nodes',
  'hi': 'पहुंचनीय आसंधि (नोड) को चुनते समय हाइलाइट बक्से की अवधि'},
 {'en': 'Highlight border color',
  'hi': 'सीमांत (बोर्डर) के रंग को हाइलाइट करें'},
 {'en': 'The color and opacity of the highlight border.',
  'hi': 'हाइलाइट किए गए सीमांत का रंग और अपारदर्शिता। '},
 {'en': 'Highlight fill color', 'hi': 

In [11]:
val['translation'][:5]


[{'en': 'A black box in your car?', 'hi': 'आपकी कार में ब्लैक बॉक्स?'},
 {'en': "As America's road planners struggle to find the cash to mend a crumbling highway system, many are beginning to see a solution in a little black box that fits neatly by the dashboard of your car.",
  'hi': 'जबकि अमेरिका के सड़क योजनाकार, ध्वस्त होते हुए हाईवे सिस्टम को सुधारने के लिए धन की कमी से जूझ रहे हैं, वहीं बहुत-से लोग इसका समाधान छोटे से ब्लैक बॉक्स में देख रहे हैं, जो आपकी कार के डैशबोर्ड पर सफ़ाई से फिट हो जाता है।'},
 {'en': "The devices, which track every mile a motorist drives and transmit that information to bureaucrats, are at the center of a controversial attempt in Washington and state planning offices to overhaul the outdated system for funding America's major roads.",
  'hi': 'यह डिवाइस, जो मोटर-चालक द्वारा वाहन चलाए गए प्रत्येक मील को ट्रैक करती है तथा उस सूचना को अधिकारियों को संचारित करती है, आजकल अमेरिका की प्रमुख सड़कों का वित्त-पोषण करने के लिए पुराने हो चुके सिस्टम का जीर्णोद्धार क

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_path)




In [13]:
tokenizer("My name is Anubhav Gupta and I'm from Ayodhya !")


{'input_ids': [633, 300, 23, 1406, 8258, 6273, 2259, 2797, 3958, 7595, 10, 56, 70, 363, 72, 238, 26134, 25948, 15908, 44, 61, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
def preprocessing(examples):
  inputs = [x['en'] for x in examples['translation']]
  targets = [x['hi'] for x in examples['translation']]
  eng_hin = tokenizer(inputs,  truncation=True)

  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets, truncation=True)

  eng_hin['labels'] = labels['input_ids']

  return eng_hin

In [15]:
train_data = train.map(preprocessing, batched = True)


Map:   0%|          | 0/16591 [00:00<?, ? examples/s]



In [16]:
val_data = val.map(preprocessing, batched = True)


In [17]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model = model, return_tensors="tf")

In [19]:
train_dataset = model.prepare_tf_dataset(train_data, batch_size= 32, shuffle=True, tokenizer=tokenizer, collate_fn= data_collator)

In [20]:
val_dataset = model.prepare_tf_dataset(val_data, batch_size= 32, shuffle=True, tokenizer=tokenizer, collate_fn= data_collator)

In [21]:
train_dataset

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(32, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(32, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(32, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(32, None), dtype=tf.int64, name=None))>

In [22]:
val_dataset

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(32, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(32, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(32, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(32, None), dtype=tf.int64, name=None))>

In [23]:
opt = transformers.AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [24]:
model.compile(optimizer=opt)


In [26]:
model.fit(x = train_dataset, validation_data= val_dataset ,epochs=1, verbose=True)




<keras.src.callbacks.History at 0x7df4b8f070a0>

In [27]:
model.save_pretrained("/content/drive/MyDrive/model.keras")


In [28]:
tokens = AutoTokenizer.from_pretrained(model_path)
new_model = TFAutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/model.keras")


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at /content/drive/MyDrive/model.keras.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [29]:
# test input
test_inputs = "My name is Anubhav Gupta."
inp = tokens([test_inputs], return_tensors="np")
out = new_model.generate(**inp)

with tokens.as_target_tokenizer():
    print(tokens.decode(out[0]))

<pad> मेरा नाम Abava है.</s> <pad> <pad>




In [30]:
# Dummy data for testing our fine tuned model
inputs_list = [
    "The sky is clear, and the sun is shining brightly.",
    "I am currently studying algorithms for artificial intelligence.",
    "Yesterday, I went to the library to borrow some books.",
    "The conference on machine learning is scheduled for next week.",
    "She is passionate about exploring new technologies.",
    "The project deadline has been extended by a week.",
    "The scientific community is making significant advancements in AI research.",
    "I enjoy taking long walks in the park during the evening.",
    "The latest software update includes several important security fixes.",
    "The professor provided valuable feedback on our research paper.",
]

# Tokenize and generate translations
for input_text in inputs_list:
    inp = tokens(input_text, return_tensors="np", padding=True, truncation=True)
    out = new_model.generate(**inp)

    with tokens.as_target_tokenizer():
        translation = tokens.decode(out[0], skip_special_tokens=True)

    print(f"Input: {input_text}")
    print(f"Translation: {translation}\n")


Input: The sky is clear, and the sun is shining brightly.
Translation: और आसमान खोल दिए जाएँगे

Input: I am currently studying algorithms for artificial intelligence.
Translation: मैं अभी कृत्रिम बुद्धि के लिए एल्गोरिदम का अध्ययन कर रहा हूँ.

Input: Yesterday, I went to the library to borrow some books.
Translation: कल, मैं पुस्तकालय में कुछ किताबें उधार लेने के लिए गया.

Input: The conference on machine learning is scheduled for next week.
Translation: मशीन सीखने पर सम्मेलन अगले सप्ताह के लिए नियत किया गया है.

Input: She is passionate about exploring new technologies.
Translation: वह नई तकनीकों की खोज के बारे में भावुक है.

Input: The project deadline has been extended by a week.
Translation: इस परियोजना को एक सप्ताह से जारी रखा गया है.

Input: The scientific community is making significant advancements in AI research.
Translation: वैज्ञानिक समुदाय एआई अनुसंधान में उल्लेखनीय उन्‍नति कर रहा है ।

Input: I enjoy taking long walks in the park during the evening.
Translation: मैं शाम के 

In [31]:
test_inputs_list = []
references_list = []

for i in range(len(test_inputs_list)):
  test_inputs_list = val['translation'][i]['en']
  references_list = val['translation'][i]['hi']

In [33]:
from nltk.translate.bleu_score import corpus_bleu

translations_list = []

# Tokenize and generate translations for each test input
for test_inputs, reference_translations in zip(test_inputs_list, references_list):
    inp = tokens([test_inputs], return_tensors="np")
    out = new_model.generate(**inp)

    with tokens.as_target_tokenizer():
        translation = tokens.decode(out[0], skip_special_tokens=True)
        translations_list.append([translation])

# Calculate BLEU score
bleu_score = corpus_bleu(references_list, translations_list)
print("BLEU Score:", bleu_score)


BLEU Score: 35.17202918301
