In [57]:
import pandas as pd

In [58]:
data = pd.read_csv('/content/task3dataset.csv')
# Create a dictionary to map column names
column_mapping = {
    'english': 'en',
    'hindi': 'hi'
}

# Rename the DataFrame columns
data.rename(columns=column_mapping, inplace=True)
data

Unnamed: 0,en,hi
0,Add a new weekly reminder for Sunday Brunch at...,9 : 30 am ko Sunday Brunch के के लिए मैं नया w...
1,message danny and see if he wants to go to com...,डैनी ko message karo मैं और देखो के he चाहता ह...
2,set alarm for 2 hours,do घंटा के के लिए alarm set कार्डो
3,kill the reminder for baking a cake for neil,नील के के लिए cake bake करने के reminder ko मी...
4,retrieve my chat requests please,Please mere chat अनुरोध ko retrieve करे
...,...,...
10891,outdoor concerts this summer,Is summer ko outdoor संगीत कार्यक्रम
10892,I ' d like you to snooze my alarm for 10 minut...,Mai चाहता हू की एएपी mere alarm ko 10 मिनट के ...
10893,Please play Tupac,please Tupac play करे
10894,Message Rhonda Ask her details about Vacation ...,Rhonda ko message करे मैं और July me Vacation ...


In [59]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

In [60]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

In [61]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

In [62]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from datasets import Dataset, DatasetDict

In [63]:
# Split your DataFrame into train, validation, and test sets
train_data = data.sample(frac=0.7, random_state=42)
remaining_data = data.drop(train_data.index)
valid_data = remaining_data.sample(frac=0.5, random_state=42)
test_data = remaining_data.drop(valid_data.index)

# Create Dataset objects from DataFrames
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)
test_dataset = Dataset.from_pandas(test_data)

# Create a DatasetDict object
raw_datasets = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})


In [64]:
raw_datasets = DatasetDict({
    'train': train_dataset.remove_columns('__index_level_0__'),
    'validation': valid_dataset.remove_columns('__index_level_0__'),
    'test': test_dataset.remove_columns('__index_level_0__')
})
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['en', 'hi'],
        num_rows: 7627
    })
    validation: Dataset({
        features: ['en', 'hi'],
        num_rows: 1634
    })
    test: Dataset({
        features: ['en', 'hi'],
        num_rows: 1635
    })
})

In [65]:
raw_datasets['train'][1]

{'en': 'Can you remind me to send Nina a card',
 'hi': 'क्या एपी मुझे Nina ko card भेजने के लिये याद दिला कृपया है'}

In [66]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [67]:
max_input_length = 128
max_target_length = 128

source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [68]:
preprocess_function(raw_datasets["train"][:2])



{'input_ids': [[1636, 52, 2831, 765, 44, 22, 0], [701, 27, 8457, 156, 7, 986, 55002, 19, 3215, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[2419, 667, 292, 353, 8648, 628, 44, 22, 0], [65, 19708, 130, 208, 3634, 568, 292, 44, 6915, 3869, 232, 2776, 6, 310, 419, 12421, 681, 5, 0]]}

In [69]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/7627 [00:00<?, ? examples/s]

Map:   0%|          | 0/1634 [00:00<?, ? examples/s]

Map:   0%|          | 0/1635 [00:00<?, ? examples/s]

In [70]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['en', 'hi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7627
    })
    validation: Dataset({
        features: ['en', 'hi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1634
    })
    test: Dataset({
        features: ['en', 'hi', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1635
    })
})

In [71]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-hi.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [72]:
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 30

In [73]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [74]:
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [75]:
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

In [76]:
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [77]:
generation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    batch_size=8,
    shuffle=False,
    collate_fn=generation_data_collator,
)

In [78]:
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [79]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7d5439c98130>

In [80]:
model.save_pretrained("translation_model")

TESTING


In [81]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("translation_model")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at translation_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [87]:
input_text  = " I was waiting for my bag"

tokenized = tokenizer([input_text], return_tensors='np')
out = model.generate(**tokenized, max_length=128)
print(out)

tf.Tensor(
[[61949    44 32934    63   801   292  1328  2326   667     6   310  6973
     57   153    82     0]], shape=(1, 16), dtype=int32)


In [None]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

In [88]:
with tokenizer.as_target_tokenizer():
    print(tokenizer.decode(out[0], skip_special_tokens=True))

माई अपने baghy के लिये इंतजार कर रहा था


In [92]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

def translate_text(input_text, model_checkpoint="Helsinki-NLP/opus-mt-en-hi", max_length=128):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = TFAutoModelForSeq2SeqLM.from_pretrained('translation_model')

    # Tokenize the input text
    tokenized = tokenizer([input_text], return_tensors='np')

    # Generate translation
    out = model.generate(**tokenized, max_length=max_length)

    # Decode and return the translated text
    with tokenizer.as_target_tokenizer():
        translated_text = tokenizer.decode(out[0], skip_special_tokens=True)

    return translated_text

def main():
    input_text = input("Enter the text you want to translate: ")
    translated_text = translate_text(input_text)
    print("\nTranslated Text:", translated_text)

if __name__ == "__main__":
    main()


Enter the text you want to translate: So even if it's a big video, I will clearly mention all the products.


All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at translation_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


Translated Text: तो ye भी agley se video है, तो माई सभी traffic के बारे में me सबको बताुंगा.
