## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Shared tokenizer used for both the English and the Vietnamese sentences
tokenizer = get_tokenizer(tokenizer="basic_english")

def yield_tokens(examples):
    """Lazily yield the token list of every sentence in ``examples``.

    Each sentence is split with the module-level ``tokenizer``; one list of
    tokens is produced per input sentence, in input order.
    """
    yield from (tokenizer(sentence) for sentence in examples)

def vectorize_en(text, vocab, sequence_length):
    """Convert an English sentence into a fixed-length tensor of token ids.

    The sentence is split with the module-level ``tokenizer``, every token is
    looked up in ``vocab``, an ``<eos>`` id is appended, and the result is
    right-padded with ``<pad>`` ids up to ``sequence_length``.

    When the sentence is too long, the *words* are truncated so that the
    ``<eos>`` marker is always kept as the last id (previously the marker
    itself was cut off).

    Args:
        text: Raw sentence.
        vocab: Mapping from token to id; must contain ``<eos>`` and ``<pad>``.
        sequence_length: Length of the returned tensor. Values <= 0 yield an
            empty tensor.

    Returns:
        1-D ``torch.long`` tensor with ``max(sequence_length, 0)`` elements.
    """
    word_ids = [vocab[token] for token in tokenizer(text)]

    # Reserve one slot for <eos> so truncation never drops it.
    room_for_words = max(sequence_length - 1, 0)
    token_ids = word_ids[:room_for_words] + [vocab["<eos>"]]

    # Only relevant for sequence_length <= 0, where not even <eos> fits.
    token_ids = token_ids[:max(sequence_length, 0)]

    token_ids += [vocab["<pad>"]] * (sequence_length - len(token_ids))
    return torch.tensor(token_ids, dtype=torch.long)

def vectorize_vn(text, vocab, sequence_length):
    """Convert a Vietnamese sentence into a fixed-length tensor of token ids.

    The sentence is split with the module-level ``tokenizer``, every token is
    looked up in ``vocab``, the ids are wrapped in ``<sos>`` ... ``<eos>``, and
    the result is right-padded with ``<pad>`` ids up to ``sequence_length``.

    When the sentence is too long, the *words* are truncated so that the
    closing ``<eos>`` marker is kept whenever at least two slots are available
    (previously the marker itself was cut off).

    Args:
        text: Raw sentence.
        vocab: Mapping from token to id; must contain ``<sos>``, ``<eos>`` and
            ``<pad>``.
        sequence_length: Length of the returned tensor. Values <= 0 yield an
            empty tensor.

    Returns:
        1-D ``torch.long`` tensor with ``max(sequence_length, 0)`` elements.
    """
    word_ids = [vocab[token] for token in tokenizer(text)]

    # Reserve two slots (<sos> and <eos>) so truncation never drops <eos>.
    room_for_words = max(sequence_length - 2, 0)
    token_ids = [vocab["<sos>"]] + word_ids[:room_for_words] + [vocab["<eos>"]]

    # Only relevant for sequence_length < 2, where both markers cannot fit.
    token_ids = token_ids[:max(sequence_length, 0)]

    token_ids += [vocab["<pad>"]] * (sequence_length - len(token_ids))
    return torch.tensor(token_ids, dtype=torch.long)

In [3]:
# Toy English corpus: one sentence per sample
corpus_en = ["learning english", "build ai model"]
data_size_en = len(corpus_en)

# Upper bound on the vocabulary size, and fixed length of a vectorized sentence
vocab_size_en, sequence_length_en = 8, 4

In [4]:
# Build the English vocabulary from the tokenized corpus
vocab_en = build_vocab_from_iterator(
    yield_tokens(corpus_en),
    specials=["<unk>", "<pad>", "<eos>"],
    max_tokens=vocab_size_en,
)

# Tokens missing from the vocabulary are looked up as <unk>
unk_index_en = vocab_en["<unk>"]
vocab_en.set_default_index(unk_index_en)

vocab_en.get_stoi()

{'model': 7,
 'learning': 6,
 'english': 5,
 'build': 4,
 'ai': 3,
 '<eos>': 2,
 '<pad>': 1,
 '<unk>': 0}

In [5]:
# Turn every English sentence into a fixed-length tensor of token ids
corpus_ids_en = [
    vectorize_en(sentence, vocab_en, sequence_length_en)
    for sentence in corpus_en
]

# Show one tensor per line
print(*corpus_ids_en, sep="\n")

tensor([6, 5, 2, 1])
tensor([4, 3, 7, 2])


In [6]:
# Toy Vietnamese corpus: one sentence per sample
corpus_vn = ["học tiếng anh", "xây mô hình ai"]
data_size_vn = len(corpus_vn)

# Upper bound on the vocabulary size, and fixed length of a vectorized sentence
vocab_size_vn, sequence_length_vn = 12, 6

In [7]:
# Build the Vietnamese vocabulary from the tokenized corpus
vocab_vn = build_vocab_from_iterator(
    yield_tokens(corpus_vn),
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    max_tokens=vocab_size_vn,
)

# Tokens missing from the vocabulary are looked up as <unk>
unk_index_vn = vocab_vn["<unk>"]
vocab_vn.set_default_index(unk_index_vn)

vocab_vn.get_stoi()

{'tiếng': 9,
 'học': 7,
 'anh': 5,
 '<sos>': 2,
 'xây': 10,
 'mô': 8,
 'hình': 6,
 'ai': 4,
 '<eos>': 3,
 '<pad>': 1,
 '<unk>': 0}

In [8]:
# Turn every Vietnamese sentence into a fixed-length tensor of token ids
corpus_ids_vn = [
    vectorize_vn(sentence, vocab_vn, sequence_length_vn)
    for sentence in corpus_vn
]

# Show one tensor per line
print(*corpus_ids_vn, sep="\n")

tensor([2, 7, 9, 5, 3, 1])
tensor([ 2, 10,  8,  6,  4,  3])


## Train with full data

In [9]:
class Encoder(nn.Module):
    """Single-layer RNN encoder with hand-picked, deterministic parameters.

    The embedding table and every RNN weight/bias are overwritten with fixed
    values right after construction, so the forward pass does not depend on
    random initialisation. The preset tensors are sized for 8 token ids,
    2-dimensional embeddings and a 3-dimensional hidden state.
    """

    def __init__(self, vocab_size_en, embedding_dim, hidden_dim):
        super().__init__()

        # Embedding layer; its table is replaced by the preset rows below
        # (row i is the vector of token id i).
        self.embedding = nn.Embedding(vocab_size_en, embedding_dim)
        self.custom_weights = torch.tensor(
            [
                [-0.1, 0.5],
                [1.7, -0.8],
                [1.0, -1.9],
                [-1.3, -0.1],
                [0.2, 1.3],
                [0.4, -0.6],
                [0.5, 0.1],
                [0.4, -1.3],
            ],
            dtype=torch.float32,
        )
        self.embedding.weight = nn.Parameter(self.custom_weights)

        # Recurrent layer; inputs are laid out as (batch, sequence, feature).
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        preset_rnn_values = {
            "weight_ih_l0": [[-0.4, 0.1],
                             [0.4, -0.4],
                             [0.1, 0.2]],
            "weight_hh_l0": [[-0.5, 0.1, 0.1],
                             [-0.2, -0.2, 0.1],
                             [-0.2, -0.2, 0.2]],
            "bias_ih_l0": [0.4, 0.5, 0.2],
            "bias_hh_l0": [0.1, -0.2, 0.1],
        }
        for param_name, values in preset_rnn_values.items():
            preset = torch.tensor(values, dtype=torch.float32)
            setattr(self.rnn, param_name, nn.Parameter(preset))

    def forward(self, src):
        """Embed the token ids in ``src`` and run them through the RNN.

        Returns:
            The pair ``(outputs, hidden)`` exactly as produced by ``nn.RNN``:
            the per-step outputs and the final hidden state.
        """
        return self.rnn(self.embedding(src))

In [10]:
# One batch holding the vectorized sentence "learning english": [6, 5, 2, 1]
input_sample = torch.tensor([[6, 5, 2, 1]], dtype=torch.long)

# Sizes shared by the encoder and the decoder
embedding_dim = 2
hidden_dim = 3
encoder = Encoder(
    vocab_size_en=vocab_size_en,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)

# Per-step outputs and final hidden state of the encoder
outputs, hidden = encoder(input_sample)

In [11]:
class Decoder(nn.Module):
    """Single-layer RNN decoder with hand-picked, deterministic parameters.

    The embedding table and every RNN weight/bias are overwritten with fixed
    values right after construction, so the forward pass does not depend on
    random initialisation. The preset tensors are sized for 2-dimensional
    embeddings and a 3-dimensional hidden state.

    The preset embedding table only defines 11 rows. When ``vocab_size_vn`` is
    larger, the table is padded with zero rows so that its size matches the
    declared number of embeddings and every id below ``vocab_size_vn`` can be
    looked up. Rows 0-10 are never altered by the padding.
    """

    def __init__(self, vocab_size_vn, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size_vn, embedding_dim)
        self.custom_weights = torch.tensor([[ 0.3, -0.7],
                                            [-1.1,  0.9],
                                            [ 0.8, -1.4],
                                            [-0.5,  0.2],
                                            [ 1.2, -0.3],
                                            [-0.9,  1.5],
                                            [ 0.6, -0.2],
                                            [-1.7,  0.4],
                                            [ 0.2,  1.1],
                                            [-1.0,  0.8],
                                            [ 1.4, -1.2],
                                            ]).float()

        # Pad (never truncate) the preset table up to the declared vocabulary
        # size; otherwise ids >= 11 would be out of range for the lookup.
        preset_rows = self.custom_weights.size(0)
        if vocab_size_vn > preset_rows:
            zero_rows = self.custom_weights.new_zeros(
                vocab_size_vn - preset_rows, self.custom_weights.size(1)
            )
            embedding_table = torch.cat([self.custom_weights, zero_rows], dim=0)
        else:
            embedding_table = self.custom_weights
        self.embedding.weight = nn.Parameter(embedding_table)

        # Recurrent layer; inputs are laid out as (batch, sequence, feature).
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.rnn.bias_ih_l0 = nn.Parameter(torch.tensor([0.3, -0.1, 0.6]).float())
        self.rnn.bias_hh_l0 = nn.Parameter(torch.tensor([-0.2, 0.4, -0.3]).float())
        self.rnn.weight_ih_l0 = nn.Parameter(torch.tensor([[0.2, -0.3],
                                                            [-0.5,  0.6],
                                                            [ 0.1,  0.4]]).float())
        self.rnn.weight_hh_l0 = nn.Parameter(torch.tensor([[-0.3,  0.2, -0.4],
                                                            [ 0.5, -0.1,  0.3],
                                                            [ 0.2, -0.5,  0.1]]).float())

    def forward(self, input, hidden):
        """Embed ``input`` and run it through the RNN starting from ``hidden``.

        Args:
            input: Tensor of token ids.
            hidden: Initial hidden state handed to ``nn.RNN``.

        Returns:
            The pair ``(output, hidden)`` exactly as produced by ``nn.RNN``:
            the per-step outputs and the final hidden state.
        """
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden

In [12]:
# Decoder input prefix: <sos> (2), "học" (7), "tiếng" (9) in vocab_vn
target_sample = torch.tensor([[2, 7, 9]], dtype=torch.long)
decoder = Decoder(vocab_size_vn, embedding_dim, hidden_dim)

# Store the decoder state under its own name. Rebinding `hidden` here would
# make a re-run of this cell start from the decoder's state instead of the
# encoder's final hidden state.
prediction, decoder_hidden = decoder(target_sample, hidden)

In [13]:
class Seq2Seq_Model(nn.Module):
    """Encoder-decoder wrapper that scores the vocabulary from the decoder state.

    The encoder's final hidden state initialises the decoder, and the
    decoder's final hidden state is projected by a linear layer whose weight
    and bias are fixed to hand-picked values (12 output rows, 3 input
    features).
    """

    def __init__(self, encoder, decoder, vocab_size_vn):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

        # Output projection: decoder hidden state -> one logit per token id.
        self.fc_out = nn.Linear(decoder.hidden_dim, vocab_size_vn)
        preset_weight = torch.tensor(
            [
                [-0.52, 0.25, -0.35],
                [0.06, -0.16, 0.18],
                [-0.52, 0.33, 0.02],
                [-0.12, -0.57, -0.4],
                [-0.18, -0.5, -0.01],
                [0.35, -0.03, 0.46],
                [-0.13, 0.16, 0.23],
                [-0.99, 0.31, -0.24],
                [-0.74, -0.25, 0.5],
                [-0.44, -0.02, -0.39],
                [-0.35, -0.43, 0.35],
                [0.26, -0.49, 0.09],
            ],
            dtype=torch.float32,
        )
        preset_bias = torch.tensor(
            [0.12, -0.21, -0.15, -0.54, 0.36, 0.49,
             0.49, 0.07, 0.23, 0.52, 0.29, -0.27],
            dtype=torch.float32,
        )
        self.fc_out.weight = nn.Parameter(preset_weight)
        self.fc_out.bias = nn.Parameter(preset_bias)

    def forward(self, sequence_en, sequence_vn):
        """Return the logits computed from the decoder's final hidden state.

        Only the hidden states are used; the per-step outputs of both RNNs
        are discarded.
        """
        _, encoder_state = self.encoder(sequence_en)
        _, decoder_state = self.decoder(sequence_vn, encoder_state)
        # Drop the leading dimension of size 1 before the projection.
        return self.fc_out(decoder_state.squeeze(0))

In [14]:
# Wire the already-built encoder and decoder into one model and show its layout
model = Seq2Seq_Model(
    encoder=encoder,
    decoder=decoder,
    vocab_size_vn=vocab_size_vn,
)
print(model)

Seq2Seq_Model(
  (encoder): Encoder(
    (embedding): Embedding(8, 2)
    (rnn): RNN(2, 3, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(12, 2)
    (rnn): RNN(2, 3, batch_first=True)
  )
  (fc_out): Linear(in_features=3, out_features=12, bias=True)
)


In [15]:
# Source sentence "learning english" and decoder prefix <sos> "học" "tiếng"
en_sample = torch.tensor([[6, 5, 2, 1]], dtype=torch.long)
vn_sample = torch.tensor([[2, 7, 9]], dtype=torch.long)

# Token id the model is expected to produce next.
# NOTE(review): id 6 is "hình" in vocab_vn, while the next word of
# "học tiếng anh" is "anh" (id 5) - confirm which one is intended.
target = torch.tensor([6], dtype=torch.long)

# One logit per Vietnamese token id
outputs = model(en_sample, vn_sample)
print(outputs)

# Highest-scoring token id for each sample in the batch
prediction = outputs.argmax(dim=1)
print(prediction)

tensor([[ 4.5245e-01, -3.5172e-01,  2.5774e-01, -9.8450e-01, -9.5534e-04,
          3.8520e-01,  6.6128e-01,  5.7886e-01,  2.3522e-01,  6.0968e-01,
          4.0912e-02, -7.3660e-01]], grad_fn=<AddmmBackward0>)
tensor([6])


In [16]:
# Cross-entropy between the predicted logits and the target token id
criterion = nn.CrossEntropyLoss()
loss = criterion(input=outputs, target=target)

print(loss)

tensor(2.0306, grad_fn=<NllLossBackward0>)
