## Data

In [1]:
import torch
import torch.nn as nn
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

#### corpus_generation

In [2]:
corpus_generation = [
    "ăn quả nhớ kẻ trồng cây",
    "làm giàu không khó"    
]

# max vocabulary size and sequence length
vocab_size_generation = 14
sequence_length_generation = 7

In [3]:
# Initialize the tokenizer and define a trainer
tokenizer_generation = Tokenizer(WordLevel())
tokenizer_generation.pre_tokenizer = Whitespace()
tokenizer_generation.enable_padding(pad_id=1, 
                                    pad_token="<pad>", 
                                    length=sequence_length_generation)
tokenizer_generation.enable_truncation(max_length=sequence_length_generation)

# Train the tokenizer on your corpus
trainer_generation = WordLevelTrainer(vocab_size=vocab_size_generation, 
                                      special_tokens=["<unk>", "<pad>", "<sos>", "<eos>"])
tokenizer_generation.train_from_iterator(corpus_generation, trainer_generation)

In [4]:
data_x = []
data_y = []
for vector in corpus_generation:
    vector = ['<sos>'] + vector.split() + ['<eos>']
    data_x.append( ' '.join(vector[:-1]) )
    data_y.append( ' '.join(vector[1:]) )

print(data_x)
print(data_y)

['<sos> ăn quả nhớ kẻ trồng cây', '<sos> làm giàu không khó']
['ăn quả nhớ kẻ trồng cây <eos>', 'làm giàu không khó <eos>']


In [5]:
# Tokenize and numericalize your samples
def vectorize_generation(x, y, tokenizer_generation):     
    x_ids = tokenizer_generation.encode(x).ids
    y_ids = tokenizer_generation.encode(y).ids
    print(x_ids, y_ids)
    return x_ids, y_ids

# Vectorize the samples
data_x_ids = []
data_y_ids = []
for x, y in zip(data_x, data_y):
    x_ids, y_ids = vectorize_generation(x, y, tokenizer_generation)
    data_x_ids.append(x_ids)
    data_y_ids.append(y_ids)

data_x_ids = torch.tensor(data_x_ids, dtype=torch.long)
data_y_ids = torch.tensor(data_y_ids, dtype=torch.long)

[2, 13, 11, 10, 8, 12, 4] [13, 11, 10, 8, 12, 4, 3]
[2, 9, 5, 7, 6, 1, 1] [9, 5, 7, 6, 3, 1, 1]


#### topics

In [7]:
topics = [
    'khuyên răn',
    'kinh doanh'
]
vocab_size_context = 6
sequence_length_context = 2

In [8]:
# Initialize the tokenizer and define a trainer
tokenizer_context = Tokenizer(WordLevel())
tokenizer_context.pre_tokenizer = Whitespace()
tokenizer_context.enable_padding(pad_id=1, 
                                 pad_token="<pad>", 
                                 length=sequence_length_context)
tokenizer_context.enable_truncation(max_length=sequence_length_context)

# Train the tokenizer on your corpus
trainer_context = WordLevelTrainer(vocab_size=vocab_size_context, 
                                   special_tokens=["<unk>", "<pad>"])
tokenizer_context.train_from_iterator(topics, trainer_context)

In [9]:
topics_ids = []
for vector in topics:
    topics_ids.append(vector)

topics_ids2 = []
for x in topics:
    x_ids = tokenizer_context.encode(x).ids
    topics_ids2.append(x_ids)

# print
print(topics_ids)
print(topics_ids2)

['khuyên răn', 'kinh doanh']
[[3, 5], [4, 2]]


In [10]:
topics_tensor = torch.tensor(topics_ids2, dtype=torch.long)
print(topics_tensor)

tensor([[3, 5],
        [4, 2]])


## Train with full data

In [11]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, model_dim, nhead):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer_encoder = nn.TransformerEncoderLayer(d_model=model_dim, 
                                                              nhead=nhead, 
                                                              dim_feedforward=6,
                                                              batch_first=True)

    def forward(self, src):
        # src = [batch_size, seq_length]
        
        embedded = self.embedding(src)                  
        # [batch_size, seq_length, embedding_dim]
        
        context = self.transformer_encoder(embedded)        
        return context

In [12]:
embedding_dim, model_dim, nhead = 6, 6, 2
encoder = Encoder(vocab_size_context, embedding_dim, model_dim, nhead)

context_sample = encoder(topics_tensor)
print(context_sample.shape)

torch.Size([2, 2, 6])


In [13]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, model_dim, nhead):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer_decoder = nn.TransformerDecoderLayer(d_model=model_dim, 
                                                        nhead=nhead, 
                                                        dim_feedforward=6,
                                                        batch_first=True)
        self.fc_out = nn.Linear(model_dim, vocab_size)

    def forward(self, input, context):
        # input = [batch_size, seq_length_vn]   
        # context = [batch_size, seq_length_en, model_dim]   
                
        embedded = self.embedding(input)
        # embedded = [batch_size, seq_length_vn, embedding_dim]
        
        output = self.transformer_decoder(embedded, context)
        # output = [batch_size, seq_length_vn, model_dim]
        
        prediction = self.fc_out(output)
        # prediction = [batch_size, vocab_size_vn]
        
        return prediction.permute(0, 2, 1)

In [14]:
decoder = Decoder(vocab_size_generation, embedding_dim, model_dim, nhead)

prediction = decoder(data_x_ids, context_sample)
print(prediction.shape)

torch.Size([2, 14, 7])


In [15]:
class Seq2Seq_Model(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder  

    def forward(self, sequence_encoder, sequence_decoder):        
        context = self.encoder(sequence_encoder)
        outputs = self.decoder(sequence_decoder, context)
            
        return outputs

In [16]:
model = Seq2Seq_Model(encoder, decoder)
outputs = model(topics_tensor, data_x_ids)
print(outputs.shape)

torch.Size([2, 14, 7])


In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [18]:
# train
for _ in range(40):
    optimizer.zero_grad()
    outputs = model(topics_tensor, data_x_ids)
    loss = criterion(outputs, data_y_ids)
    #print(loss.item())
    loss.backward()
    optimizer.step()

In [19]:
outputs = model(topics_tensor, data_x_ids)
#print(outputs)
print(torch.argmax(outputs, axis=1))

tensor([[13, 11, 10,  8, 12,  4,  3],
        [ 9,  5,  7,  6,  3,  1,  1]])


In [20]:
data_y_ids

tensor([[13, 11, 10,  8, 12,  4,  3],
        [ 9,  5,  7,  6,  3,  1,  1]])

In [21]:
# check
topic1 = topics_tensor[0:1,:]
print(topic1)

data_x_id1 = data_x_ids[0:1,:]
print(data_x_id1)

data_y_id1 = data_y_ids[0:1,:]
print(data_y_id1)

tensor([[3, 5]])
tensor([[ 2, 13, 11, 10,  8, 12,  4]])
tensor([[13, 11, 10,  8, 12,  4,  3]])


In [22]:
outputs = model(topic1, data_x_id1)
print(outputs.shape)
print(torch.argmax(outputs, axis=1))

torch.Size([1, 14, 7])
tensor([[13, 11, 10,  8, 12,  4,  3]])


In [23]:
# check
topic1 = topics_tensor[1:2,:]
print(topic1)

data_x_id1 = data_x_ids[1:2,:]
print(data_x_id1)

data_y_id1 = data_y_ids[1:2,:]
print(data_y_id1)

tensor([[4, 2]])
tensor([[2, 9, 5, 7, 6, 1, 1]])
tensor([[9, 5, 7, 6, 3, 1, 1]])


In [24]:
outputs = model(topic1, data_x_id1)
print(outputs.shape)
print(torch.argmax(outputs, axis=1))

torch.Size([1, 14, 7])
tensor([[9, 5, 7, 6, 3, 1, 1]])


## Inference

In [25]:
promt = '<sos>'
promt_length = 1
promt_ids = tokenizer_generation.encode(promt).ids
print(promt_ids)

[2, 1, 1, 1, 1, 1, 1]


In [26]:
topic = 'kinh doanh'   # 'kinh doanh' ; 'khuyên răn'
topic_ids = tokenizer_context.encode(topic).ids
topic_tensor = torch.tensor(topic_ids, dtype=torch.long).reshape(1, -1)
print(topic_tensor)

tensor([[4, 2]])


In [27]:
for i in range(sequence_length_generation - promt_length):
    promt_tensor = torch.tensor(promt_ids, 
                                dtype=torch.long).reshape(1, -1)
    outputs = model(topic_tensor, promt_tensor)
    outputs = torch.argmax(outputs, axis=1)   
    next_id = outputs[0][promt_length+i-1]

    promt_ids[promt_length+i] = next_id.item()
print(promt_ids)

[2, 9, 5, 7, 6, 3, 9]


In [28]:
print(tokenizer_generation.get_vocab())

{'<pad>': 1, 'ăn': 13, 'làm': 9, 'giàu': 5, '<unk>': 0, 'không': 7, '<eos>': 3, 'trồng': 12, '<sos>': 2, 'cây': 4, 'kẻ': 8, 'quả': 11, 'khó': 6, 'nhớ': 10}
