## Data

In [1]:
import torch
import torch.nn as nn
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Tokenizer shared by yield_tokens, vectorize_en and vectorize_vn below;
# 'basic_english' selects torchtext's built-in tokenizer of that name.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(examples):
    """Lazily yield one token list per text in ``examples``.

    Each text is split with the module-level ``tokenizer``. The resulting
    generator is what the vocabulary builder consumes.
    """
    yield from (tokenizer(sample) for sample in examples)

# Tokenize and numericalize a source (English) sentence
def vectorize_en(text, vocab, sequence_length):
    """Encode ``text`` as exactly ``sequence_length`` token ids.

    The sentence is split with the module-level ``tokenizer``, mapped through
    ``vocab``, terminated with ``<eos>`` and right-padded with ``<pad>``.

    Over-long sentences are cut *before* ``<eos>`` is appended, so the
    terminator is never sliced off together with the overflowing words.
    Sentences that fit are encoded as ``words + <eos> + padding``.

    Args:
        text: Raw sentence.
        vocab: Object indexable by token string; must contain ``<eos>`` and
            ``<pad>``.
        sequence_length: Length of the returned list.

    Returns:
        A list of ``sequence_length`` ints (empty when ``sequence_length <= 0``).
    """
    length = max(sequence_length, 0)
    eos_id, pad_id = vocab["<eos>"], vocab["<pad>"]

    word_ids = [vocab[token] for token in tokenizer(text)]
    # Keep the final slot free for <eos> so truncation cannot drop it.
    word_ids = word_ids[:max(length - 1, 0)]

    token_ids = (word_ids + [eos_id])[:length]
    return token_ids + [pad_id] * (length - len(token_ids))

def vectorize_vn(text, vocab, sequence_length):
    """Encode a target sentence as exactly ``sequence_length`` token ids.

    The sentence is split with the module-level ``tokenizer``, mapped through
    ``vocab``, wrapped as ``<sos> ... <eos>`` and right-padded with ``<pad>``.

    Over-long sentences are cut *before* the markers are added, so both
    ``<sos>`` and ``<eos>`` survive truncation whenever ``sequence_length >= 2``.
    Sentences that fit are encoded as ``<sos> + words + <eos> + padding``.

    Args:
        text: Raw sentence.
        vocab: Object indexable by token string; must contain ``<sos>``,
            ``<eos>`` and ``<pad>``.
        sequence_length: Length of the returned list.

    Returns:
        A list of ``sequence_length`` ints (empty when ``sequence_length <= 0``).
    """
    length = max(sequence_length, 0)
    sos_id, eos_id, pad_id = vocab["<sos>"], vocab["<eos>"], vocab["<pad>"]

    word_ids = [vocab[token] for token in tokenizer(text)]
    # Two slots are reserved: one for <sos>, one for <eos>.
    word_ids = word_ids[:max(length - 2, 0)]

    token_ids = ([sos_id] + word_ids + [eos_id])[:length]
    return token_ids + [pad_id] * (length - len(token_ids))

In [3]:
# Toy English (source-side) corpus: one sentence per entry.
corpus_en = ["good morning", "ai books"]
data_size_en = len(corpus_en)

# Maximum vocabulary size and fixed length of every encoded English sequence.
vocab_size_en, sequence_length_en = 7, 3

In [4]:
# Build the English vocabulary; the special tokens are listed first and end up
# with the lowest indices (see the mapping displayed below).
vocab_en = build_vocab_from_iterator(
    yield_tokens(corpus_en),
    specials=["<unk>", "<pad>", "<eos>"],
    max_tokens=vocab_size_en,
)

# Make look-ups of unseen tokens resolve to the <unk> index.
vocab_en.set_default_index(vocab_en["<unk>"])

# Display the token -> index mapping.
vocab_en.get_stoi()

{'morning': 6,
 'good': 5,
 'books': 4,
 'ai': 3,
 '<eos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [5]:
# Encode every English sentence as a fixed-length list of token ids.
corpus_ids_en = [
    vectorize_en(sentence, vocab_en, sequence_length_en) for sentence in corpus_en
]

# Stack the rows into a LongTensor (one row per sentence) and show it.
en_data = torch.tensor(corpus_ids_en, dtype=torch.long)
print(en_data)

tensor([[5, 6, 2],
        [3, 4, 2]])


In [6]:
# Toy Vietnamese (target-side) corpus, aligned entry-by-entry with corpus_en.
corpus_vn = ["chào buổi sáng", "sách ai"]
data_size_vn = len(corpus_vn)

# Maximum vocabulary size and fixed length of every decoder sequence.
vocab_size_vn, sequence_length_vn = 9, 4

In [7]:
# Build the Vietnamese vocabulary. Unlike the English one it also carries
# <sos>, which marks the start of every decoder input.
vocab_vn = build_vocab_from_iterator(
    yield_tokens(corpus_vn),
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    max_tokens=vocab_size_vn,
)

# Make look-ups of unseen tokens resolve to the <unk> index.
vocab_vn.set_default_index(vocab_vn["<unk>"])

# Display the token -> index mapping.
vocab_vn.get_stoi()

{'sách': 7,
 'sáng': 8,
 'chào': 6,
 'buổi': 5,
 '<sos>': 2,
 'ai': 4,
 '<eos>': 3,
 '<pad>': 1,
 '<unk>': 0}

In [8]:
# Encode with one extra position (sequence_length_vn + 1): each row is later
# split into a decoder input and a label shifted by one step.
corpus_ids_vn = [
    vectorize_vn(sentence, vocab_vn, sequence_length_vn + 1) for sentence in corpus_vn
]

print(corpus_ids_vn)

[[2, 6, 5, 8, 3], [2, 7, 4, 3, 1]]


In [9]:
# Shifted pairs for next-token training: the decoder input drops the last id of
# each row and the label drops the first, so label[t] is the id that follows
# input[t].
input_vn_data = torch.tensor([ids[:-1] for ids in corpus_ids_vn], dtype=torch.long)
label_vn_data = torch.tensor([ids[1:] for ids in corpus_ids_vn], dtype=torch.long)

# Show both tensors.
print(input_vn_data)
print(label_vn_data)

tensor([[2, 6, 5, 8],
        [2, 7, 4, 3]])
tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


## Model

In [10]:
class Encoder(nn.Module):
    """Token embedding followed by a single ``nn.TransformerEncoderLayer``.

    Args:
        vocab_size_en: Number of rows in the embedding table.
        embedding_dim: Embedding width. Must equal ``model_dim`` because the
            embeddings are fed to the transformer layer unchanged.
        model_dim: ``d_model`` of the transformer layer.
        nhead: Number of attention heads.
        dim_feedforward: Hidden width of the layer's feed-forward block.
        dropout: Dropout probability used inside the layer.

    Raises:
        ValueError: If ``embedding_dim`` and ``model_dim`` differ.

    Note:
        ``forward`` applies neither a positional encoding nor a padding mask.
    """

    def __init__(self, vocab_size_en, embedding_dim, model_dim, nhead,
                 dim_feedforward=6, dropout=0.0):
        super().__init__()
        # Fail at construction time rather than with an opaque shape error
        # on the first forward pass.
        if embedding_dim != model_dim:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal model_dim ({model_dim})"
            )
        self.embedding = nn.Embedding(vocab_size_en, embedding_dim)
        self.transformer_encoder = nn.TransformerEncoderLayer(d_model=model_dim,
                                                              nhead=nhead,
                                                              dim_feedforward=dim_feedforward,
                                                              dropout=dropout,
                                                              batch_first=True)

    # src = [batch_size, seq_length]
    def forward(self, src):
        """Return the contextualised representation of ``src`` token ids."""
        embedded = self.embedding(src)                # [batch_size, seq_length, d]
        context = self.transformer_encoder(embedded)  # [batch_size, seq_length, d]
        return context

In [11]:
# Shape of the encoder input: [batch_size, seq_length_en]
en_data.shape

torch.Size([2, 3])

In [12]:
# Seed the global RNG before the first parameters are created so that weight
# initialisation is repeatable on a fresh Restart & Run All.
torch.manual_seed(0)

# The embedding width and the transformer width are fed to the same layer, so
# they are set to the same value.
embedding_dim, model_dim, nhead = 6, 6, 2
encoder = Encoder(vocab_size_en, embedding_dim, model_dim, nhead)

# Smoke test: the result should be [batch_size, seq_length_en, model_dim].
context_sample = encoder(en_data)
print(context_sample.shape)

torch.Size([2, 3, 6])


In [13]:
class Decoder(nn.Module):
    """Token embedding, one causal ``nn.TransformerDecoderLayer`` and a
    linear projection onto the target vocabulary.

    Args:
        vocab_size_vn: Target vocabulary size (embedding rows / output classes).
        embedding_dim: Embedding width. Must equal ``model_dim`` because the
            embeddings are fed to the transformer layer unchanged.
        model_dim: ``d_model`` of the transformer layer.
        nhead: Number of attention heads.
        sequence_length_vn: Side length of the pre-computed causal mask.
        dim_feedforward: Hidden width of the layer's feed-forward block.
        dropout: Dropout probability used inside the layer.

    Raises:
        ValueError: If ``embedding_dim`` and ``model_dim`` differ.
    """

    def __init__(self, vocab_size_vn, embedding_dim, model_dim, nhead, sequence_length_vn,
                 dim_feedforward=6, dropout=0.0):
        super().__init__()
        # Fail at construction time rather than with an opaque shape error
        # on the first forward pass.
        if embedding_dim != model_dim:
            raise ValueError(
                f"embedding_dim ({embedding_dim}) must equal model_dim ({model_dim})"
            )
        self.embedding = nn.Embedding(vocab_size_vn, embedding_dim)

        # True above the diagonal = "position i may not attend to position j > i".
        # Registered as a buffer so it follows the module across .to()/.cuda();
        # persistent=False keeps it out of state_dict, so saved checkpoints
        # keep the same keys.
        causal_mask = torch.triu(torch.ones(sequence_length_vn, sequence_length_vn), diagonal=1).bool()
        self.register_buffer("mask", causal_mask, persistent=False)

        self.transformer_decoder = nn.TransformerDecoderLayer(d_model=model_dim,
                                                              nhead=nhead,
                                                              dim_feedforward=dim_feedforward,
                                                              dropout=dropout,
                                                              batch_first=True)
        self.fc_out = nn.Linear(model_dim, vocab_size_vn)

    def _causal_mask(self, seq_len, device):
        """Return a ``[seq_len, seq_len]`` boolean causal mask."""
        if seq_len <= self.mask.size(0):
            # The top-left corner of a causal mask is itself a causal mask.
            return self.mask[:seq_len, :seq_len]
        # Longer than the pre-computed mask: build one of the required size.
        return torch.triu(torch.ones(seq_len, seq_len, device=device), diagonal=1).bool()

    # input: [batch_size, seq_length_vn]
    # context: [batch_size, seq_length_en, d]
    def forward(self, input, context):
        """Return next-token logits shaped ``[batch_size, vocab_size_vn, seq_length]``.

        The class dimension is moved to dim 1, the layout ``nn.CrossEntropyLoss``
        expects for sequence targets. ``input`` may have any sequence length;
        the causal mask is sized to match it.
        """
        tgt_mask = self._causal_mask(input.size(1), input.device)
        embedded = self.embedding(input)                                          # [batch_size, seq_length_vn, d]
        output = self.transformer_decoder(embedded, context, tgt_mask=tgt_mask)   # [batch_size, seq_length_vn, d]
        prediction = self.fc_out(output)                                          # [batch_size, seq_length_vn, vocab_size_vn]

        return prediction.permute(0, 2, 1)                                        # [batch_size, vocab_size_vn, seq_length_vn]

In [14]:
# Instantiate the decoder with the same widths as the encoder.
decoder = Decoder(
    vocab_size_vn=vocab_size_vn,
    embedding_dim=embedding_dim,
    model_dim=model_dim,
    nhead=nhead,
    sequence_length_vn=sequence_length_vn,
)

# Smoke test: logits come back as [batch_size, vocab_size_vn, seq_length_vn].
outputs = decoder(input_vn_data, context_sample)
print(outputs.shape)

torch.Size([2, 9, 4])


In [15]:
class Seq2Seq_Model(nn.Module):
    """Chain an encoder and a decoder into a single module.

    ``forward`` encodes the source sequence and hands the result, together
    with the target-side input, to the decoder.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, sequence_en, sequence_vn):
        """Return ``decoder(sequence_vn, encoder(sequence_en))``."""
        return self.decoder(sequence_vn, self.encoder(sequence_en))

In [16]:
# Wire the encoder and decoder built above into one trainable module.
model = Seq2Seq_Model(encoder=encoder, decoder=decoder)

# Smoke test on the full toy batch.
outputs = model(en_data, input_vn_data)
print(outputs.shape)

torch.Size([2, 9, 4])


In [17]:
# Label shape is [batch_size, seq_length_vn]; the model output printed above is
# [batch_size, vocab_size_vn, seq_length_vn], i.e. the classes sit on dim 1.
label_vn_data.shape

torch.Size([2, 4])

## Train

In [18]:
# Token-level cross-entropy averaged over every label position. No ignore_index
# is passed, so <pad> label positions contribute to the loss as well.
criterion = nn.CrossEntropyLoss(reduction="mean")

# Adam over all encoder and decoder parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.05)

In [19]:
# Full-batch training: every iteration is one Adam step on the whole toy
# dataset, and the loss measured before that step is printed.
num_steps = 35
for step in range(num_steps):
    optimizer.zero_grad()                      # clear gradients of the previous step
    outputs = model(en_data, input_vn_data)    # [batch_size, vocab_size_vn, seq_length_vn]
    loss = criterion(outputs, label_vn_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

2.572211980819702
2.0582990646362305
1.8298975229263306
1.6489864587783813
1.452692985534668
1.3585753440856934
1.2285832166671753
1.0857281684875488
0.884067714214325
0.8026986718177795
0.6825419664382935
0.5574626922607422
0.4896906614303589
0.4031267464160919
0.3443576395511627
0.26172083616256714
0.2054610550403595
0.14212802052497864
0.10413385927677155
0.07778821885585785
0.05209149792790413
0.03987732529640198
0.0316433347761631
0.022833406925201416
0.01573421061038971
0.013697528280317783
0.012465091422200203
0.010541660711169243
0.008088898845016956
0.006039146799594164
0.004743597470223904
0.004063807427883148
0.003617584239691496
0.0032029422000050545
0.0029022207017987967


In [20]:
# Greedy prediction on the training pairs. No gradients are needed here, so the
# forward pass runs under no_grad to avoid building an autograd graph.
with torch.no_grad():
    outputs = model(en_data, input_vn_data)

# dim=1 is the vocabulary axis of the [batch_size, vocab_size_vn, seq_length_vn] output.
print(torch.argmax(outputs, dim=1))

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])


In [21]:
# Ground-truth labels, displayed for comparison with the predictions printed above.
label_vn_data

tensor([[6, 5, 8, 3],
        [7, 4, 3, 1]])