In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv


# **Building a BERT-Based Translation Model with google/mt5-base:**

google/mt5-baseis a pre-trained multilingual model from the mT5 (Multilingual T5) family, designed for text generation tasks like translation, summarization, and question-answering across multiple languages.

In [2]:
#import libraries
import pandas as pd
import unicodedata
import re
from datasets import load_dataset, Dataset, DatasetDict
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from sklearn.model_selection import train_test_split
import torch
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import random

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
MAX_LENGTH=15

# **Preparing Data:**

In [4]:
#The data consists of a large parallel corpus of Hindi and English sentences.
df= pd.read_csv("/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv")

clean the data

In [5]:
df.head()

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default


In [6]:
def unicodeToAscii(s):
  return "".join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c)!='Mn')
def normalizeString_for_eng(s):
    s = str(s)
    s = unicodeToAscii(s.strip())
    s = re.sub(r"[.!?]", "", s)
    s = re.sub(r"[^a-zA-Z0-9]+", r" ", s)
    return s.strip()

def normalizeString_for_hin(s):
  s = str(s)
  hindi_chars = "\u0900-\u097F\u0980-\u09FF a-zA-Z0-9!? |"
  s = re.sub(r"[^" + hindi_chars + "]", r" ", s)
  return s.strip()

list_of_pairs = []
for index, row in df.iterrows():
    english_sentence =   normalizeString_for_eng(row["english"])
    hindi_sentence =  normalizeString_for_hin(row["hindi"])
    if len( english_sentence.split(' ')) < MAX_LENGTH and len(hindi_sentence.split(' '))< MAX_LENGTH:
      sentence_pair = [hindi_sentence, english_sentence]
      list_of_pairs.append(sentence_pair)

Split the cleaned dataset into training, validation, and test sets

In [7]:
train_set, df_temp = train_test_split(list_of_pairs, test_size=0.5, random_state=42)
val_set, test_set = train_test_split(df_temp, test_size=0.2, random_state=42)

put our train and validation in dataset object

In [8]:
# convert the training and validation sets into dictionaries and then into Datasets
dataset = DatasetDict({
    'train': Dataset.from_dict({
        'sentence1': [pair[0] for pair in train_set],
        'sentence2': [pair[1] for pair in train_set]
    }),
    'validation': Dataset.from_dict({
        'sentence1': [pair[0] for pair in val_set],
        'sentence2': [pair[1] for pair in val_set]
    })
})

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2'],
        num_rows: 489369
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2'],
        num_rows: 391495
    })
})

# **Load the model and tokenizer**

In [10]:
model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenize the sentence pair with truncation and padding to the max length

In [11]:
def tokenize_sentences(sentences, tokenizer=tokenizer,max_length=30 ):

  for example in range(1):
    hindi_sentence =sentences["sentence1"]
    english_sentence = sentences['sentence2']
    encoded_pair = tokenizer(
        text=hindi_sentence,
        
        text_target=english_sentence,
        return_tensors="pt",
        max_length=max_length,
        padding="max_length",  
        truncation=True,        
        add_special_tokens=True,
        return_attention_mask=True   
    )

  return {
        'input_ids': encoded_pair['input_ids'].squeeze(),  # Remove extra dimensions
        'attention_mask': encoded_pair['attention_mask'].squeeze(),
        'labels': encoded_pair['labels'].squeeze()
    }

In [12]:
tokenize_sentences(dataset["train"][2])

{'input_ids': tensor([29196, 12946,  1665,     1,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'labels': tensor([  259, 24191,     1,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0])}

In [13]:
#Applying the tokenize_sentences function to each example in the dataset. 
tokenized_datasets = dataset.map(tokenize_sentences, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/489369 [00:00<?, ? examples/s]

Map:   0%|          | 0/391495 [00:00<?, ? examples/s]

# **Training the model:**

In [15]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',           
    per_device_train_batch_size=40,    
    per_device_eval_batch_size=40,
    num_train_epochs=1,               
    weight_decay=0.01,              
    logging_dir='./logs',             
    logging_steps=10,
    evaluation_strategy="epoch",     
    save_strategy="epoch"             
)
 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'] ,
    eval_dataset=tokenized_datasets['validation'],   
    tokenizer=tokenizer
)
trainer.train()

2024-08-13 18:31:47.055133: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 18:31:47.055276: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 18:31:47.192049: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.762,0.607788


TrainOutput(global_step=12235, training_loss=1.2622004191535416, metrics={'train_runtime': 9251.0847, 'train_samples_per_second': 52.899, 'train_steps_per_second': 1.323, 'total_flos': 3.438140662674432e+16, 'train_loss': 1.2622004191535416, 'epoch': 1.0})

# **Model Evaluation and Metrics:**

Function to Evaluate and Decode Text Using  The Trained   Model

In [28]:
def evaluate(text,model=model, tokenizer=tokenizer, max_length=30 ):
     
    with torch.no_grad():
         
        inputs = tokenizer(
            text, 
            return_tensors="pt", 
            max_length=max_length, 
            padding="max_length", 
            truncation=True
        ).to(device)

         
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask']
        )

        
        predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return predicted_text

Calculate BLEU  for Model Output

In [139]:
# Calculate the average BLEU score for a dataset of sentence pairs.
def BLUE_SCORE(data, batch_size):
    bleu_scores = []
    
   
    data = list(data.values())
    data = list(zip(*data))
    
    num_batches = (len(data) + batch_size - 1) // batch_size   
    
    for i in range(num_batches):
         
        start_index = i * batch_size
        end_index = min(start_index + batch_size, len(data))
        
        batch_data = data[start_index:end_index]   
        
        for input_sentence, target_sentence in batch_data:
            
            output_words = evaluate(input_sentence)
            output_sentence = ' '.join(output_words)
            
            target_tokens = target_sentence.split()
            output_tokens = output_sentence.split()
 
            bleu_score = sentence_bleu([target_tokens], output_tokens)
            bleu_scores.append(bleu_score)
     
    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    
    return average_bleu_score

In [142]:
average_bleu_score = BLUE_SCORE(dataset["validation"][:10000], 10000)
print(f"The average BLEU score for the evaluated dataset is: {average_bleu_score:.2f}")
 

The average BLEU score for the evaluated dataset is: 0.06


Evaluate Model Predictions on test Set

In [41]:
def evaluate_train(model, n=10):
    for i in range(n):

        pair = random.choice(test_set)
        print('>', pair[0])
        print('=', pair[1])
        output_words= evaluate(  pair[0])

        output_sentence = ''.join(output_words)
        print('<', output_sentence)
        print('')

In [45]:
evaluate_train(model)

> घन
= Cube21
< a

> पूरा दिनः
= All day
< Full Day

> स्नूज़    S
= Snooze
< spool

> काला बाजार
= Black market
< Black market

> आपके पास कोई वर्कशीट नहीं है जिसे सहेजा जा सके
= You do not have a tab that could be saved
< You have no toolbar that can be saved

