## Data

In [5]:
import torch
import torch.nn as nn
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

corpus = [
    "ăn quả nhớ kẻ trồng cây",
    "có chí thì nên"    
]
data_size = len(corpus)

# Define the max vocabulary size and sequence length
vocab_size = 15
sequence_length = 7

In [6]:
# Initialize the tokenizer and define a trainer
tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()
tokenizer.enable_padding(pad_id=1, 
                         pad_token="<pad>", 
                         length=sequence_length)
tokenizer.enable_truncation(max_length=sequence_length)

# Train the tokenizer on your corpus
trainer = WordLevelTrainer(vocab_size=vocab_size, 
                           special_tokens=["<unk>", "<pad>", "<sos_topic1>", "<sos_topic2>", "<eos>"])
tokenizer.train_from_iterator(corpus, trainer)

In [7]:
data_x = []
data_y = []

corpus[0] = '<sos_topic1> ' + corpus[0] + ' <eos>'
corpus[1] = '<sos_topic2> ' + corpus[1] + ' <eos>'

for vector in corpus:
    vector = vector.split()
    data_x.append( ' '.join(vector[:-1]) )
    data_y.append( ' '.join(vector[1:]) ) 

print(data_x)
print(data_y)

['<sos_topic1> ăn quả nhớ kẻ trồng cây', '<sos_topic2> có chí thì nên']
['ăn quả nhớ kẻ trồng cây <eos>', 'có chí thì nên <eos>']


In [8]:
# Tokenize and numericalize your samples
def vectorize(x, y, tokenizer, sequence_length):     
    x_ids = tokenizer.encode(x).ids
    y_ids = tokenizer.encode(y).ids
    print(x_ids, y_ids)
    return x_ids, y_ids

# Vectorize the samples
data_x_ids = []
data_y_ids = []
for x, y in zip(data_x, data_y):
    x_ids, y_ids = vectorize(x, y, tokenizer, sequence_length)
    data_x_ids.append(x_ids)
    data_y_ids.append(y_ids)

data_x_ids = torch.tensor(data_x_ids, dtype=torch.long)
data_y_ids = torch.tensor(data_y_ids, dtype=torch.long)

[2, 14, 11, 9, 8, 13, 6] [14, 11, 9, 8, 13, 6, 4]
[3, 7, 5, 12, 10, 1, 1] [7, 5, 12, 10, 4, 1, 1]


## Train with full data

In [9]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, 
                                          num_heads=num_heads,
                                          batch_first=True)
        self.ffn = nn.Linear(in_features=embed_dim, 
                             out_features=ff_dim)
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, query, key, value, mask):
        attn_output, _ = self.attn(query, key, value, attn_mask=mask)
        out_1 = self.layernorm_1(query + attn_output)
        ffn_output = nn.GELU()(self.ffn(out_1))
        out_2 = self.layernorm_2(out_1 + ffn_output)
        return out_2

# test
transformer = TransformerDecoderBlock(6, 2, 6)
t = torch.randn(1, 10, 6)
mask = torch.triu(torch.ones(10, 10), diagonal=1).bool()  
output = transformer(t, t, t, mask)
print(output.shape)

torch.Size([1, 10, 6])


In [10]:
class TG_Model(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim)
        self.mask = torch.triu(torch.ones(sequence_length, sequence_length), 
                               diagonal=1).bool()
        self.transformer = TransformerDecoderBlock(embed_dim, 
                                                   num_heads, 
                                                   embed_dim)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x, x, self.mask)        
        x = self.linear(x)
        return x.permute(0,2,1)

model = TG_Model(vocab_size, 8, 1, sequence_length)
print(model)

TG_Model(
  (embedding): Embedding(15, 8)
  (transformer): TransformerDecoderBlock(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
    )
    (ffn): Linear(in_features=8, out_features=8, bias=True)
    (layernorm_1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (layernorm_2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  )
  (linear): Linear(in_features=8, out_features=15, bias=True)
)


In [11]:
outputs = model(data_x_ids)
print(outputs.shape)

torch.Size([2, 15, 7])


In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [13]:
# train
for _ in range(40):
    optimizer.zero_grad()
    outputs = model(data_x_ids)
    loss = criterion(outputs, data_y_ids)
    #print(loss.item())
    loss.backward()
    optimizer.step()

In [14]:
outputs = model(data_x_ids)
#print(outputs)
print(torch.argmax(outputs, axis=1))

tensor([[14, 11,  9,  8, 13,  6,  4],
        [ 7,  5, 12, 10,  4,  1,  1]])


In [15]:
data_y_ids

tensor([[14, 11,  9,  8, 13,  6,  4],
        [ 7,  5, 12, 10,  4,  1,  1]])

## Inference

In [16]:
promt = '<sos_topic2> có'
promt_length = 2
promt_ids = tokenizer.encode(promt).ids
print(promt_ids)

[3, 7, 1, 1, 1, 1, 1]


In [17]:
for i in range(sequence_length - promt_length):
    promt_tensor = torch.tensor(promt_ids, dtype=torch.long).reshape(1, -1)
    outputs = model(promt_tensor)
    outputs = torch.argmax(outputs, axis=1)   
    next_id = outputs[0][promt_length+i-1]

    promt_ids[promt_length+i] = next_id.item()
    print(promt_ids)

[3, 7, 5, 1, 1, 1, 1]
[3, 7, 5, 12, 1, 1, 1]
[3, 7, 5, 12, 10, 1, 1]
[3, 7, 5, 12, 10, 4, 1]
[3, 7, 5, 12, 10, 4, 12]


In [18]:
print(tokenizer.get_vocab())

{'ăn': 14, 'nhớ': 9, '<sos_topic2>': 3, 'cây': 6, 'thì': 12, '<sos_topic1>': 2, '<pad>': 1, 'nên': 10, 'chí': 5, '<eos>': 4, '<unk>': 0, 'có': 7, 'quả': 11, 'trồng': 13, 'kẻ': 8}
