# **Building a Transformer with PyTorch**
### Crearemos un modelo de Transformer utilizando PyTorch, una potente herramienta de aprendizaje automático moderno.

https://www.datacamp.com/tutorial/building-a-transformer-with-py-torch

In [None]:
! pip3 install torch torchvision torchaudio



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

## Los parametros de multiple atencion.

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Se asegura de que la dimensión del modelo (d_model) sea divisible por el número de cabezas.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Inicializar dimensiones
        self.d_model = d_model # Dimension del modelo
        self.num_heads = num_heads # Numero de cabezas
        self.d_k = d_model // num_heads # Dimensión de las claves, consulta y valores de cada cabeza

        # Capas lineales para transformar entradas.
        self.W_q = nn.Linear(d_model, d_model) # Transformación de consultas
        self.W_k = nn.Linear(d_model, d_model) # Transformación de claves
        self.W_v = nn.Linear(d_model, d_model) # Transformacion de Valores
        self.W_o = nn.Linear(d_model, d_model) # Transformacion de Salida

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Calcula puntuaciones de atención
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        # Aplica máscaras si se proporcionan (útil para evitar que se preste atención a ciertas partes)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Se aplica Softmax para obtener probabilidades de atención
        attn_probs = torch.softmax(attn_scores, dim=-1)

        # Multiplicar por valores para obtener el resultado final.
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        # Cambia la forma de la entrada para que num_heads genere atención para múltiples cabezas
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        # Combine las múltiples cabezas nuevamente a su forma original.
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        # Aplicar transformaciones lineales y cabezas divididas.
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Da atención a productos escalados
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combina las cabezas y aplica la transformación de salida.
        output = self.W_o(self.combine_heads(attn_output))
        return output

## Algoritmo FeedForward

In [None]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()

        # Define dos capas lineales para la transformación anticipada.
        self.fc1 = nn.Linear(d_model, d_ff)  # Capa 1 completamente conectada
        self.fc2 = nn.Linear(d_ff, d_model)  # Capa 2 completamente conectada

        # Función de activación para introducir no linealidad.
        self.relu = nn.ReLU()

    def forward(self, x):
        # Paso directo a través de la red feedforward:
         # 1. Aplicar la primera capa lineal (fc1)
         # 2. Aplicar la función de activación Rectified Linear Unit (ReLU)
         # 3. Aplicar la segunda capa lineal (fc2)
        return self.fc2(self.relu(self.fc1(x)))

## Posicionamiento en el codificador

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        # Inicializa una matriz de codificación posicional.
        pe = torch.zeros(max_seq_length, d_model)

        # Crea un tensor de posición con valores [0, 1, 2, ..., max_seq_length-1]
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)

        # Calcula div_term utilizado para el cálculo de senos y cosenos
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        # Calcula los componentes seno y coseno de la codificación posicional.
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Registra la codificación posicional como un búfer (no entrenable)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # Agrega la codificación posicional al tensor de entrada a lo largo de la dimensión de la secuencia
        return x + self.pe[:, :x.size(1)]

## Codificador

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        # Capa de autoatención de múltiples cabezales
        self.self_attn = MultiHeadAttention(d_model, num_heads)

        # Capa FeedForward según la posición
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)

        # Normalización de capas tanto para atención como para FeedForward
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Calculo de "abandono" por regularización
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Autoatención de múltiples cabezales: aplica el mecanismo de atención a la entrada
        attn_output = self.self_attn(x, x, x, mask)

        # Conexión residual y normalización de capas (Subcapa 1).
        x = self.norm1(x + self.dropout(attn_output))

        # Feedforward por posición: aplica la transformación feedforward
        ff_output = self.feed_forward(x)

        # Conexión residual y normalización de capas (Subcapa 2).
        x = self.norm2(x + self.dropout(ff_output))

        return x

## Decodificador

In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        # Capa de autoatención para la secuencia objetivo
        self.self_attn = MultiHeadAttention(d_model, num_heads)

        # Capa de atención cruzada (atención codificador-decodificador)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)

        # Capa FeedForward según la posición
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)

        # Normalización de capas para las tres subcapas.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # Calculo de "abandono" por regularización
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # Capa de autoatención para la secuencia objetivo
        self_attn_output = self.self_attn(x, x, x, tgt_mask)

        # Conexión residual y normalización de capas para la autoatención
        x = self.norm1(x + self.dropout(self_attn_output))

        # Capa de atención cruzada (atención codificador-decodificador aplicada)
        cross_attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)

        x = self.norm2(x + self.dropout(cross_attn_output))

        ff_output = self.feed_forward(x)

        x = self.norm3(x + self.dropout(ff_output))

        return x

## Transformador

In [None]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()

        # Incrustaciones de origen y destino: convierte los ID de token en vectores.
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)

        # Codificación posicional: agrega información posicional a las incrustaciones.
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        # Capas de codificador y decodificador apiladas durante el tiempo determinado en 'num_layers'.
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        # Capa lineal final: mapas desde d_model hasta el tamaño del vocabulario objetivo.
        self.fc = nn.Linear(d_model, tgt_vocab_size)

        # Calculo de "abandono" por regularización, evita el overfitting
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        # Máscara de origen: enmascara los valores acolchados (donde el ID del token es 0).
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)

        # Máscara de destino: hace lo mismo para el objetivo, pero también enmascara valores futuros para garantizar un comportamiento autorregresivo.
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask

        return src_mask, tgt_mask

    def forward(self, src, tgt):
        # Genera máscaras para secuencias de origen y de destino.
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        # Aplicar incrustaciones y codificaciones posicionales.
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        # Pasa la secuencia fuente a través de las capas del codificador.
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # Pasa la secuencia de destino y la salida del codificador a través de las capas del decodificador.
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        # Asigna la salida del decodificador al tamaño del vocabulario objetivo.
        output = self.fc(dec_output)

        return output

### ***Entrenamiento con informacion generada de forma aleatoria***

Solo con propositos de prueba, no es recomendable para cualquier operacion que deseemos hacer con el Transformer.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Definimos hiperparámetros, todos crean "palabras" de forma aleatoria
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1
learning_rate = 0.001
batch_size = 64
num_epochs = 50

# Definimos el modelo de transformador que ya desarrollamos
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Define la función de pérdida y el optimizador.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(transformer.parameters(), lr=learning_rate)

# Genera datos de muestra aleatorios (De ser posible se pueden reemplazar con datos reales)
src_data = torch.randint(1, src_vocab_size, (batch_size, max_seq_length))  # (batch_size, seq_length)
tgt_data = torch.randint(1, tgt_vocab_size, (batch_size, max_seq_length))  # (batch_size, seq_length)

# Bucle de entrenamiento
for epoch in range(num_epochs):
    transformer.train()  # Establece el modelo en modo de entrenamiento.

    # Itera a través de lotes
    for batch_start in range(0, len(src_data), batch_size):
        # Obtiene un lote de datos de origen y de destino.
        src_batch = src_data[batch_start:batch_start+batch_size]
        tgt_batch = tgt_data[batch_start:batch_start+batch_size]

        # Pone los gradientes en cero.
        optimizer.zero_grad()

        # Pase adelantado
        output = transformer(src_batch, tgt_batch[:, :-1])  # Exclude the last token in target

        # Calculo de la perdida
        loss = criterion(output.view(-1, tgt_vocab_size), tgt_batch[:, 1:].contiguous().view(-1))

        # Propagación hacia atrás
        loss.backward()
        optimizer.step()

    # Pérdida de impresión por cada "epoch".
    print(f'Epoch [{epoch+1}/{num_epochs}] Loss: {loss.item()}')

Epoch [1/50] Loss: 8.681730270385742
Epoch [2/50] Loss: 8.5138521194458
Epoch [3/50] Loss: 8.263517379760742
Epoch [4/50] Loss: 8.744540214538574
Epoch [5/50] Loss: 8.185982704162598
Epoch [6/50] Loss: 8.140958786010742
Epoch [7/50] Loss: 8.07398796081543
Epoch [8/50] Loss: 7.890829563140869
Epoch [9/50] Loss: 7.7089080810546875
Epoch [10/50] Loss: 8.014005661010742
Epoch [11/50] Loss: 7.812922954559326
Epoch [12/50] Loss: 7.4533233642578125
Epoch [13/50] Loss: 7.65604829788208
Epoch [14/50] Loss: 7.314331531524658
Epoch [15/50] Loss: 7.4651875495910645
Epoch [16/50] Loss: 7.060393810272217
Epoch [17/50] Loss: 6.679470062255859
Epoch [18/50] Loss: 6.681331157684326
Epoch [19/50] Loss: 6.412799835205078
Epoch [20/50] Loss: 6.313271999359131
Epoch [21/50] Loss: 5.993490219116211
Epoch [22/50] Loss: 6.005606174468994
Epoch [23/50] Loss: 5.754429817199707
Epoch [24/50] Loss: 5.888615608215332
Epoch [25/50] Loss: 5.1952738761901855
Epoch [26/50] Loss: 5.019556045532227
Epoch [27/50] Loss: 4

### ***Dandole Proposito: Traduccion del Ingles al Español.***

Viendo que no se presento ningun inconveniente entrenandolo con parametros aleatorios, procederemos a darle un proposito con el que podamos interactuar.

In [None]:
import pathlib
import random
import string
import re
import numpy as np
import tensorflow as tf
from tensorflow import keras

text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

# Cargar datos
with open(text_file, 'r', encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
    sentence_pairs = [line.split('\t') for line in lines]

# Dividir datos en partes de inglés y español
english_sentences, spanish_sentences = zip(*sentence_pairs)

# Dividir datos en conjuntos de entrenamiento, validación y prueba.
data_size = len(english_sentences)
train_size = int(0.8 * data_size)
valid_size = int(0.1 * data_size)

english_train, english_valid, english_test = english_sentences[:train_size], english_sentences[train_size:train_size+valid_size], english_sentences[train_size+valid_size:]
spanish_train, spanish_valid, spanish_test = spanish_sentences[:train_size], spanish_sentences[train_size:train_size+valid_size], spanish_sentences[train_size+valid_size:]

In [None]:
from collections import Counter

# Vocabulario y codificación de enteros

def build_vocab(sentences, specials=('<pad>', '<sos>', '<eos>', '<unk>')):
    # Aplana la lista de oraciones y cuenta las ocurrencias.

    word_freq = Counter(token for sentence in sentences for token in sentence)

    # Ordena por frecuencia y luego lexicográficamente

    ordered_vocab = [pair[0] for pair in word_freq.most_common()]
    vocab = tuple(specials) + tuple(ordered_vocab)
    stoi = {word: i for i, word in enumerate(vocab)}
    return vocab, stoi

english_vocab, english_stoi = build_vocab(english_train)
spanish_vocab, spanish_stoi = build_vocab(spanish_train)

In [None]:
#Tokenizacion

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def tokenize(sentences, language="english"):
    return [word_tokenize(sentence.lower(), language=language) for sentence in sentences]

english_train_tok = tokenize(english_train)
english_valid_tok = tokenize(english_valid)
english_test_tok = tokenize(english_test)

spanish_train_tok = tokenize(spanish_train, language="spanish")
spanish_valid_tok = tokenize(spanish_valid, language="spanish")
spanish_test_tok = tokenize(spanish_test, language="spanish")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#Vocabularios

english_vocab, english_stoi = build_vocab(english_train_tok)
spanish_vocab, spanish_stoi = build_vocab(spanish_train_tok)

In [None]:
def encode(sentences, stoi):
    sos_index = stoi['<sos>']
    eos_index = stoi['<eos>']
    unk_index = stoi['<unk>']

    encoded_sentences = []
    for sentence in sentences:
        encoded_sentence = [sos_index] + [stoi.get(token, unk_index) for token in sentence] + [eos_index]
        encoded_sentences.append(encoded_sentence)

        # Debugging problematic sentence:
        if max(encoded_sentence) >= len(stoi):
            print("Problematic sentence:", sentence)
            print("Encoded:", encoded_sentence)

    return encoded_sentences

english_train_enc = encode(english_train_tok, english_stoi)
english_valid_enc = encode(english_valid_tok, english_stoi)
english_test_enc = encode(english_test_tok, english_stoi)

spanish_train_enc = encode(spanish_train_tok, spanish_stoi)
spanish_valid_enc = encode(spanish_valid_tok, spanish_stoi)
spanish_test_enc = encode(spanish_test_tok, spanish_stoi)

In [None]:
from torch.utils.data import Dataset, DataLoader

def collate_fn(batch):
    src_sentences = [item["src"] for item in batch]
    tgt_sentences = [item["tgt"] for item in batch]

    # Pad sequences for this batch
    src_sentences_padded = torch.nn.utils.rnn.pad_sequence(src_sentences, batch_first=True, padding_value=0)
    tgt_sentences_padded = torch.nn.utils.rnn.pad_sequence(tgt_sentences, batch_first=True, padding_value=0)

    return {"src": src_sentences_padded, "tgt": tgt_sentences_padded}

class TranslationDataset(Dataset):
    def __init__(self, src_sentences, tgt_sentences):
        self.src_sentences = src_sentences
        self.tgt_sentences = tgt_sentences

    def __len__(self):
        return len(self.src_sentences)

    def __getitem__(self, index):
        return {
            "src": torch.tensor(self.src_sentences[index], dtype=torch.long),
            "tgt": torch.tensor(self.tgt_sentences[index], dtype=torch.long)
        }

# Creating datasets
train_dataset = TranslationDataset(english_train_enc, spanish_train_enc)
valid_dataset = TranslationDataset(english_valid_enc, spanish_valid_enc)
test_dataset = TranslationDataset(english_test_enc, spanish_test_enc)

# Creating dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [None]:
# Tamaño de los datasets

print(f"Size of training dataset: {len(train_dataset)}")
print(f"Size of validation dataset: {len(valid_dataset)}")
print(f"Size of test dataset: {len(test_dataset)}")

Size of training dataset: 95171
Size of validation dataset: 11896
Size of test dataset: 11897


In [None]:
src_vocab_size = len(english_vocab)
tgt_vocab_size = len(spanish_vocab)

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 is <pad>
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()

for epoch in range(1):
    total_loss = 0
    for batch in train_loader:
        src, tgt = batch["src"], batch["tgt"]

        optimizer.zero_grad()
        output = transformer(src, tgt[:, :-1])
        loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch: {epoch+1}, Loss: {total_loss / len(train_loader)}")


Epoch: 1, Loss: 3.311071655682979


### **Evaluemos el esfuerzo**

Tomo 2h17m31s entrenar solo un epoch de este modelo... mas computo requerido.

In [None]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            src, tgt = batch["src"], batch["tgt"]
            output = model(src, tgt[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            tgt = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt)
            total_loss += loss.item()

    return total_loss / len(dataloader)

test_loss = evaluate(transformer, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}")

Test Loss: 3.8270


In [None]:
import spacy
spacy_eng = spacy.load("en_core_web_sm")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    model.eval()

    # Tokeniza oraciones y las convierte en tokens numéricos.
    tokens = [token.text.lower() for token in spacy_eng(sentence)]
    tokens = [src_field["<sos>"]] + tokens + [src_field["<eos>"]]
    src_indexes = [src_field.get(token, src_field["<unk>"]) for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # Crear un tensor objetivo vacío
    trg_indexes = [trg_field["<sos>"]]

    with torch.no_grad():
        for i in range(max_len):
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
            output = model(src_tensor, trg_tensor)
            pred_token = output.argmax(2)[:, -1].item()
            trg_indexes.append(pred_token)

            # Deja de predecir si se alcanza el token de final de oración
            if pred_token == trg_field["<eos>"]:
                break

    trg_tokens = [list(trg_field.keys())[i] for i in trg_indexes]

    # Devuelve la frase traducida (sin tokens <sos> y <eos>)
    return trg_tokens[1:-1]

# Prueba
src_sentence = "I love machine learning."
translation = translate_sentence(src_sentence, english_stoi, spanish_stoi, transformer, device)
print("Source Sentence:", src_sentence)
print("Translated Sentence:", ' '.join(translation))

Source Sentence: I love machine learning.
Translated Sentence: me encanta el amor a las dos y yo .


Definitivamente, mas computo, no hay suficiente maquina, no es en broma que para entrenar cualquier forma de maquina inteligente se requiere mucho mas CPU/GPU.

In [37]:
def translate_tokenized_sentence(tokens, src_field, trg_field, model, device, max_len=50):
    model.eval()

    # Convierte a tokens numéricos y agregue <sos> y <eos>
    tokens = [src_field["<sos>"]] + tokens + [src_field["<eos>"]]
    src_indexes = [src_field.get(token, src_field["<unk>"]) for token in tokens]

    # Se convierte a tensor y agrega dimensión por lotes
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # Iniciar la traducción con el token <sos>
    trg_indexes = [trg_field["<sos>"]]

    for _ in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)
        pred_token = output.argmax(2)[:, -1].item()
        trg_indexes.append(pred_token)
        if pred_token == trg_field["<eos>"]:
            break

    # Convierte los índices de la traducción a palabras.
    trg_tokens = [list(trg_field.keys())[list(trg_field.values()).index(i)] for i in trg_indexes]

    # Remueve los tokens <sos> y <eos>
    return trg_tokens[1:-1]

In [38]:
tokenized_sentence = ["i", "love", "machine", "learning"]
translation = translate_tokenized_sentence(tokenized_sentence, english_stoi, spanish_stoi, transformer, device)
print("Tokenized Source Sentence:", tokenized_sentence)
print("Translated Sentence:", ' '.join(translation))

Tokenized Source Sentence: ['i', 'love', 'machine', 'learning']
Translated Sentence: me encanta el amor a las mujeres me encanta .


Sin Comentarios.