# Neural Machine Translation with CrossAttention

© Data Trainers LLC. GPL v 3.0.

Author: Axel Sirota

Inspired highly on the tutorial [NMT with Attention](https://www.tensorflow.org/text/tutorials/nmt_with_attention) which takes the code from the original Seq2Seq with MHA attention [Effective Approaches to Attention-based Neural Machine Translation](https://arxiv.org/abs/1508.04025v5) (Luong et al., 2015).

## Prep


### Installations and imports

In [1]:
!pip install --upgrade  textblob gensim pytorch-nlp swifter




In [2]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import itertools
import sys
from textblob import TextBlob, Word
import numpy as np
import random
import re
import swifter
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import os
import pandas as pd
import gensim
import warnings
import nltk

encoder_embedding_size = 64
decoder_embedding_size = 64
hidden_size = 512  # number of features in the hidden state
num_layers = 1
encoder_dropout = 0.2
decoder_dropout = 0.2
num_epochs = 100
lr = 0.01


def set_seeds_and_trace():
  torch.manual_seed(0)
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  random.seed(42)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')
set_seeds_and_trace()
warnings.filterwarnings('ignore')
nltk.download('punkt')
textblob_tokenizer = lambda x: TextBlob(x).words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Download and prepare the dataset

The steps you need to take to prepare the data:

1. Add a *start* and *end* token to each sentence.
2. Clean the sentences by removing special characters.
3. Create a word index and reverse word index (dictionaries mapping from word → id and id → word).
4. Pad each sentence to a maximum length.


In [3]:
%%writefile get_data.sh
if [ ! -f spa.txt ]; then
  wget -O spa.txt https://www.dropbox.com/s/ke42pnpydmy6oa6/spa.txt?dl=0
fi

Overwriting get_data.sh


In [4]:
!bash get_data.sh

In [5]:
! head spa.txt

Go.	Ve.
Go.	Vete.
Go.	Vaya.
Go.	Váyase.
Hi.	Hola.
Run!	¡Corre!
Run.	Corred.
Who?	¿Quién?
Fire!	¡Fuego!
Fire!	¡Incendio!


In [6]:
def load_data(path):
  text = path.read_text(encoding='utf-8')

  lines = text.splitlines()
  pairs = [line.split('\t') for line in lines]

  context = np.array([context for target, context in pairs])
  target = np.array([target for target, context in pairs])

  return target, context

In [7]:
import pathlib
target_raw, context_raw = load_data(pathlib.Path('./spa.txt'))
print(context_raw[-1])

Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.


In [8]:
print(target_raw[-1])

If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.


In [9]:
import re
def preprocess_text(text, should_join=True):
    text = str(text)
    text = ' '.join(str(word).lower() for word in textblob_tokenizer(text))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    if should_join:
      return ' '.join(gensim.utils.simple_preprocess(text))
    else:
      return gensim.utils.simple_preprocess(text)


In [10]:
import swifter
spanish_raw = pd.Series(context_raw).swifter.apply(preprocess_text).to_list()
print(spanish_raw[:15])

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

['ve', 'vete', 'vaya', 'váyase', 'hola', 'corre', 'corred', 'quién', 'fuego', 'incendio', 'disparad', 'ayuda', 'socorro auxilio', 'auxilio', 'salta']


In [11]:
import swifter
english_raw = pd.Series(target_raw).swifter.apply(preprocess_text).to_list()
print(english_raw[:15])

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

['go', 'go', 'go', 'go', 'hi', 'run', 'run', 'who', 'fire', 'fire', 'fire', 'help', 'help', 'help', 'jump']


In [12]:

import itertools
from torchnlp.encoders import LabelEncoder


list_of_words = list(itertools.chain.from_iterable([sentence.split() for sentence in spanish_raw]))
ids_from_words_spa = LabelEncoder(list_of_words, reserved_labels=['UNK'], unknown_index=0, min_occurrences=1)
input_size_encoder = len(ids_from_words_spa.token_to_index)

In [13]:
list_of_words = list(itertools.chain.from_iterable([sentence.split() for sentence in english_raw]))
ids_from_words_eng = LabelEncoder(list_of_words, reserved_labels=['UNK'], unknown_index=0, min_occurrences=1)

input_size_decoder = len(ids_from_words_eng.token_to_index)

In [14]:

def get_maximum_review_length(df):
    pre = df.swifter.apply(lambda x: x.split())
    maximum = pre.swifter.apply(len).max()
    return maximum


maximum_length_spanish = get_maximum_review_length(pd.Series(spanish_raw))
maximum_length_spanish

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

45

In [15]:
maximum_length_english = get_maximum_review_length(pd.Series(english_raw))
maximum_length_english

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

46

In [16]:
# Cool solution
def pad_sequence_of_tokens(x, maxlen, unk_token='UNK'):
  if len(x)<maxlen:
    x.extend([unk_token]*(maxlen-len(x)))
  return x

def get_tensor(x, maximum, ids_from_text_function):
  padding = (0, maximum-ids_from_text_function(x).shape[-1])
  return torch.squeeze(F.pad(ids_from_text_function(x), padding, "constant", 0).to(torch.long))



In [17]:
def get_ids_tensor(srs, maximum, ids_from_text_function):

  processed = srs.swifter.apply(lambda x: pad_sequence_of_tokens(x.split(), maxlen=maximum))
  result = processed.swifter.apply(lambda x: get_tensor(x, maximum, ids_from_text_function)).to_list()
  return torch.stack(result)

In [18]:
spanish_ids = get_ids_tensor(srs=pd.Series(spanish_raw), maximum=maximum_length_spanish, ids_from_text_function=ids_from_words_spa.batch_encode)
spanish_ids.shape

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

torch.Size([118964, 45])

In [19]:
english_ids = get_ids_tensor(srs=pd.Series(english_raw), maximum=maximum_length_english, ids_from_text_function=ids_from_words_eng.batch_encode)
english_ids.shape

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/118964 [00:00<?, ?it/s]

torch.Size([118964, 46])

In [20]:
train_ds = TensorDataset(spanish_ids, english_ids)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True, drop_last=True)


In [21]:
input_example, target_example = train_ds[200]
print("Input :", ' '.join(ids_from_words_spa.batch_decode(input_example)))
print("Target:", ' '.join(ids_from_words_eng.batch_decode(target_example)))


Input : apártate UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK
Target: back off UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK


In [22]:
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_layers=1, dropout=0):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout=dropout, batch_first=True)

    def forward(self, x, hidden, cell):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        return output, hidden, cell

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device),
            weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device)
        )
        return hidden[0], hidden[1]

In [23]:
class DotProductAttention(nn.Module):
    def forward(self, query, key, value):
        scores = torch.matmul(query, key.transpose(-2, -1)) / (key.size(-1) ** 0.5)
        attention_weights = torch.softmax(scores, dim=-1)
        context = torch.matmul(attention_weights, value)
        return context, attention_weights

In [24]:
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, num_layers=1, dropout=0):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.attention = DotProductAttention()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(2*hidden_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        x = self.embedding(x)
        context, attention_weights = self.attention(hidden[-1].unsqueeze(1), encoder_outputs, encoder_outputs)
        lstm_input = torch.cat((x, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        output = self.fc(output.squeeze(1))
        return output, hidden, cell, attention_weights

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (
            weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device),
            weight.new(self.num_layers, batch_size, self.hidden_size).zero_().to(device)
        )
        return hidden[0], hidden[1]

In [25]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        target_len = target.shape[1]
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(self.device)
        (encoder_hidden, encoder_cell) = self.encoder.init_hidden(batch_size)
        (decoder_hidden, decoder_cell) = self.decoder.init_hidden(batch_size)
        encoder_outputs, hidden, cell = self.encoder(source, encoder_hidden, encoder_cell)

        decoder_input = target[:, 0].unsqueeze(1)  # First input to the decoder is the <sos> tokens
        for t in range(1, target_len):
            output, hidden, cell, _ = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            teacher_force = torch.rand(1) < teacher_forcing_ratio
            top1 = output.argmax(1)
            decoder_input = target[:, t].unsqueeze(1) if teacher_force else top1.unsqueeze(1)
        for hidden_state in (encoder_hidden, encoder_cell):
          hidden_state.detach_()
        for hidden_state in (decoder_hidden, decoder_cell):
          hidden_state.detach_()
        return outputs

In [26]:
encoder_net = EncoderLSTM(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, encoder_dropout)
decoder_net = DecoderLSTM(input_size=input_size_decoder, embedding_dim=decoder_embedding_size, hidden_size=hidden_size, num_layers=num_layers, dropout=decoder_dropout)
seq2seq_model = Seq2Seq(encoder_net, decoder_net, device).to(device)


In [27]:
seq2seq_model

Seq2Seq(
  (encoder): EncoderLSTM(
    (embedding): Embedding(25704, 64)
    (lstm): LSTM(64, 512, batch_first=True, dropout=0.2)
  )
  (decoder): DecoderLSTM(
    (attention): DotProductAttention()
    (embedding): Embedding(12891, 512)
    (lstm): LSTM(1024, 512, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=512, out_features=12891, bias=True)
  )
)

In [29]:
x = spanish_ids[:2].to(device)
y = english_ids[:2].to(device)
output = seq2seq_model(x,y)
torch.argmax(F.softmax(output, dim=2), dim=2)

tensor([[    0,  2793,  4677,  5089,  2100,  2635, 10639,  1688,  9380, 10674,
          1688, 10674,  7117,  1688,  1688, 10674, 10674,  3726,  1688, 10674,
         10674,  7117, 10082,  4162,  3445, 11287, 11287,  1650,  5089,  5089,
          5089,  3577, 11199,  8744,  8958,  2012,  2012,  8954,   763,  4713,
         10387,  1688, 10674,  7117,  1688, 10674],
        [    0,  2793,  4677,  5089,  2100,  2635, 10639,  1688,  9380, 10674,
          1688, 10674,  7117,  1688,  1688, 10674, 10674,  3726,  1688, 10674,
         10674,  7117, 10082,  4162,  3445, 11287, 11287,  1650,  5089,  5089,
          5089,  3577, 11199,  8744,  8958,  2012,  2012,  8954,   763,  4514,
           207, 11287,  1650,  5828,  5089,  3577]], device='cuda:0')

In [30]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(seq2seq_model.parameters(), lr=lr)

In [None]:
from torch.autograd import Variable
running_loss = 0
for epoch in range(10):
    seq2seq_model.train()
    for batch, (x, y) in enumerate(train_dl):
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        outputs = seq2seq_model(x,y)
        words_selected = torch.argmax(F.softmax(outputs, dim=2), dim=2).to(torch.float).to(device)
        y = y.to(torch.float).to(device)
        loss = criterion(words_selected, y)
        loss = Variable(loss, requires_grad = True)
        loss.backward()
        running_loss += loss.item()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')


Epoch 1/100, Loss: 41977784.0
Epoch 2/100, Loss: 42546740.0
