In [1]:
# Import necessary libraries
import transformers
from transformers import MarianMTModel,MarianTokenizer

In [2]:
# Load the pretrained Helsinki-NLP/opus-mt-es-en Marian checkpoint (Spanish -> English)
# together with its matching tokenizer; both are resolved from the same model name.
model_name = 'Helsinki-NLP/opus-mt-es-en'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

### Input a single sentence

In [3]:
# Single Spanish source sentence to translate.
# NOTE(review): the text is unaccented and contains misspellings ("Cemeterio",
# "Ollvidados"); it is kept verbatim so the recorded outputs remain valid.
input_text = "Todavia recuerdo aquel amanecer en que mi padre me llevo por primera vez a visitar el Cemeterio de los Libros Ollvidados"

In [4]:
# Tokenize the source sentence: the tokenizer converts the text into the tensors
# the model consumes directly (return_tensors="pt" requests PyTorch tensors).
# Each entry of `input_ids` is the vocabulary id of one word or subword piece of
# the sentence; `attention_mask` is returned alongside it.
input_tokens = tokenizer(input_text, padding=True, return_tensors="pt")
print(input_tokens)

{'input_ids': tensor([[ 7126,  3274,  8733,  5775, 29736,    12,    15,   155,  1619,    74,
         19390,    36,   749,   259,     8,  4998,    14,  6289, 18878,  3187,
             4,    17, 27735,   425,   210, 19350,   503,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}


In [5]:
# Run generation: feed the tokenized source-language input to the model to
# obtain the token IDs of the translation in the target language.
translated = model.generate(**input_tokens)
print(translated)

tensor([[65000,    33,   616,  2743,    27, 24423,   208,   125,  2110,  1231,
            74,    23,     5,   269,   158,    13,  2080,     5, 55221,     7,
             5, 21778,  1513, 23462,     0]])


In [6]:
# Decode: turn each generated sequence of token IDs back into text,
# dropping special tokens (padding / end-of-sequence markers).
output_text = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in translated
]
print("Translated Text:", output_text)

Translated Text: ['I still remember that dawn when my father took me for the first time to visit the Cemetery of the Forgotten Books']


### Multiple Input Sentences

In [7]:
# A batch of several source sentences of different lengths (stored as a tuple).
# Padding, requested at tokenization time, brings every sentence in the batch to
# the same length, which batched processing with transformers requires.
input_text = (
    "Todavia recuerdo aquel amanecer en que mi padre me llevo por primera vez a visitar el Cemeterio de los Libros Ollvidados",
    " A ella puedes contarselo todo",
    "Poco despues de la guerra civil, un brote de colera se habia llevado a mi madre.",
)

In [8]:
# Tokenize the whole batch; padding=True pads the shorter sentences.
# How to read the printed attention_mask:
#   1 -> a real token that the model should attend to
#   0 -> a padding position that the model should ignore
# With a single input sentence the attention mask contained only 1s; with several
# sentences of differing lengths, the shorter ones get 0s over their padded positions.
input_tokens = tokenizer(input_text, padding=True, return_tensors="pt")
print(input_tokens)

{'input_ids': tensor([[ 7126,  3274,  8733,  5775, 29736,    12,    15,   155,  1619,    74,
         19390,    36,   749,   259,     8,  4998,    14,  6289, 18878,  3187,
             4,    17, 27735,   425,   210, 19350,   503,     0],
        [   70,   668,  1534,  4120, 24728,   163,     0, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000],
        [15876, 28786,     4,     6,  1662,   647,     2,    28, 35847,     4,
          7624,  2910,    26, 30638,  6130,     8,   155,  1898,     3,     0,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       

In [9]:
# Run generation on the padded batch: the model returns one row of
# target-language token IDs per input sentence.
translated = model.generate(**input_tokens)
print(translated)

tensor([[65000,    33,   616,  2743,    27, 24423,   208,   125,  2110,  1231,
            74,    23,     5,   269,   158,    13,  2080,     5, 55221,     7,
             5, 21778,  1513, 23462,     0],
        [65000,    99,    88,   922,   225,  1645,     3,     0, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000],
        [65000, 42912,   421,     5,   647,  1733,     2,    76, 21081,     7,
          7624,  2910,   115,   621,   125,  2195,     3,     0, 65000, 65000,
         65000, 65000, 65000, 65000, 65000]])


In [10]:
# Decode: convert every row of translated token IDs back to text,
# skipping special tokens such as padding.
output_text = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in translated
]

In [11]:
# Show the decoded translations as a list, one entry per input sentence.
print("Translated Text:", output_text)

Translated Text: ['I still remember that dawn when my father took me for the first time to visit the Cemetery of the Forgotten Books', 'You can tell her everything.', 'Shortly after the civil war, an outbreak of colera had taken my mother.']
