## Data

In [1]:
import torch
import torch.nn as nn
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "ăn quả nhớ kẻ trồng cây",
    "có chí thì nên"    
]
data_size = len(corpus)

# Define the max vocabulary size and sequence length
vocab_size = 15
sequence_length = 7

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>", 
                                            "<sos_topic1>", 
                                            "<sos_topic2>", "<eos>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'ăn': 14,
 'quả': 11,
 'nên': 10,
 'nhớ': 9,
 'trồng': 13,
 '<eos>': 4,
 'kẻ': 8,
 'chí': 5,
 '<unk>': 0,
 'cây': 6,
 '<sos_topic2>': 3,
 'thì': 12,
 'có': 7,
 '<sos_topic1>': 2,
 '<pad>': 1}

In [3]:
data_x = []
data_y = []

corpus[0] = '<sos_topic1> ' + corpus[0] + ' <eos>'
corpus[1] = '<sos_topic2> ' + corpus[1] + ' <eos>'

for vector in corpus:
    vector = vector.split()
    data_x.append(vector[:-1])
    data_y.append(vector[1:])  

print(data_x)
print(data_y)

[['<sos_topic1>', 'ăn', 'quả', 'nhớ', 'kẻ', 'trồng', 'cây'], ['<sos_topic2>', 'có', 'chí', 'thì', 'nên']]
[['ăn', 'quả', 'nhớ', 'kẻ', 'trồng', 'cây', '<eos>'], ['có', 'chí', 'thì', 'nên', '<eos>']]


In [4]:
# Tokenize and numericalize your samples
def vectorize(x, y, vocab, sequence_length):    
    x_ids = [vocab[token] for token in x][:sequence_length]
    x_ids = x_ids + [vocab["<pad>"]] * (sequence_length - len(x))

    y_ids = [vocab[token] for token in y][:sequence_length]
    y_ids = y_ids + [vocab["<pad>"]] * (sequence_length - len(y))
    
    return x_ids, y_ids

# Vectorize the samples
data_x_ids = []
data_y_ids = []
for x, y in zip(data_x, data_y):
    x_ids, y_ids = vectorize(x, y, vocab, sequence_length)
    data_x_ids.append(x_ids)
    data_y_ids.append(y_ids)

In [5]:
for x, y in zip(data_x_ids, data_y_ids):
    print(x)
    print(y)
    print()

[2, 14, 11, 9, 8, 13, 6]
[14, 11, 9, 8, 13, 6, 4]

[3, 7, 5, 12, 10, 1, 1]
[7, 5, 12, 10, 4, 1, 1]



In [6]:
data_x_ids = torch.tensor(data_x_ids, dtype=torch.long)
print(data_x_ids.shape)

torch.Size([2, 7])


In [7]:
data_y_ids = torch.tensor(data_y_ids, dtype=torch.long)
print(data_y_ids.shape)

torch.Size([2, 7])


## Train with full data

In [8]:
class TG_Model(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim)
        self.mask = torch.triu(torch.ones(sequence_length, sequence_length), 
                               diagonal=1).bool()
        self.transformer = nn.TransformerEncoderLayer(d_model=embed_dim, 
                                                       nhead=num_heads, 
                                                       batch_first=True, 
                                                       dim_feedforward=4,
                                                       dropout=0.0, 
                                                       bias=False)
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)                        # [n, seq_len, embed_dim]
        x = self.transformer(x, src_mask=self.mask)  # [n, seq_len, embed_dim]        
        x = self.linear(x)                           # [n, seq_len, vocab_size]
        return x.permute(0,2,1)                      # [n, vocab_size, seq_len]

model = TG_Model(vocab_size, 8, 1, sequence_length)
print(model)

TG_Model(
  (embedding): Embedding(15, 8)
  (transformer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=False)
    )
    (linear1): Linear(in_features=8, out_features=4, bias=False)
    (dropout): Dropout(p=0.0, inplace=False)
    (linear2): Linear(in_features=4, out_features=8, bias=False)
    (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.0, inplace=False)
    (dropout2): Dropout(p=0.0, inplace=False)
  )
  (linear): Linear(in_features=8, out_features=15, bias=True)
)


In [9]:
data_x_ids

tensor([[ 2, 14, 11,  9,  8, 13,  6],
        [ 3,  7,  5, 12, 10,  1,  1]])

In [10]:
outputs = model(data_x_ids)
print(outputs.shape)

torch.Size([2, 15, 7])


In [11]:
data_y_ids.shape

torch.Size([2, 7])

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [13]:
# train
for _ in range(40):
    optimizer.zero_grad()
    outputs = model(data_x_ids)
    loss = criterion(outputs, data_y_ids)
    print(loss.item())
    loss.backward()
    optimizer.step()

2.730449914932251
2.1119322776794434
1.7062618732452393
1.3846019506454468
1.1238559484481812
0.8968600630760193
0.6988936066627502
0.53361576795578
0.3924574553966522
0.29346925020217896
0.21857666969299316
0.15060754120349884
0.1038026213645935
0.07041735202074051
0.0472949743270874
0.03342118114233017
0.02523312158882618
0.019233187660574913
0.014044022187590599
0.009988210164010525
0.007291042245924473
0.005603210534900427
0.004517455585300922
0.003768950467929244
0.003213154850527644
0.0027723386883735657
0.0024070756044238806
0.0021035862155258656
0.0018540607998147607
0.0016490528360009193
0.001476818579249084
0.0013284601736813784
0.0011994175147265196
0.001083663199096918
0.0009810917545109987
0.0008914769277907908
0.0008143031154759228
0.0007482896908186376
0.0006918405997566879
0.0006423891754820943


In [14]:
outputs = model(data_x_ids)
#print(outputs)
print(torch.argmax(outputs, axis=1))

tensor([[14, 11,  9,  8, 13,  6,  4],
        [ 7,  5, 12, 10,  4,  1,  1]])


In [15]:
data_y_ids

tensor([[14, 11,  9,  8, 13,  6,  4],
        [ 7,  5, 12, 10,  4,  1,  1]])

## Inference

In [16]:
promt = '<sos_topic2> có'
promt = promt.split()
promt_ids = [vocab[token] for token in promt][:sequence_length]
promt_ids = promt_ids + [vocab["<pad>"]] * (sequence_length - len(promt))

print(promt_ids)

[3, 7, 1, 1, 1, 1, 1]


In [17]:
for i in range(sequence_length - len(promt)):
    promt_tensor = torch.tensor(promt_ids, dtype=torch.long).reshape(1, -1)
    outputs = model(promt_tensor)
    outputs = torch.argmax(outputs, axis=1)   
    next_id = outputs[0][len(promt)+i-1]

    promt_ids[len(promt)+i] = next_id.item()
    print(promt_ids)

[3, 7, 5, 1, 1, 1, 1]
[3, 7, 5, 12, 1, 1, 1]
[3, 7, 5, 12, 10, 1, 1]
[3, 7, 5, 12, 10, 4, 1]
[3, 7, 5, 12, 10, 4, 8]


In [19]:
promt = '<sos_topic1>'
promt = promt.split()
promt_ids = [vocab[token] for token in promt][:sequence_length]
promt_ids = promt_ids + [vocab["<pad>"]] * (sequence_length - len(promt))

for i in range(sequence_length - len(promt)):
    promt_tensor = torch.tensor(promt_ids, dtype=torch.long).reshape(1, -1)
    outputs = model(promt_tensor)
    outputs = torch.argmax(outputs, axis=1)   
    next_id = outputs[0][len(promt)+i-1]

    promt_ids[len(promt)+i] = next_id.item()
print(promt_ids)

[2, 14, 11, 9, 8, 13, 6]


In [20]:
promt = '<sos_topic2>'
promt = promt.split()
promt_ids = [vocab[token] for token in promt][:sequence_length]
promt_ids = promt_ids + [vocab["<pad>"]] * (sequence_length - len(promt))

for i in range(sequence_length - len(promt)):
    promt_tensor = torch.tensor(promt_ids, dtype=torch.long).reshape(1, -1)
    outputs = model(promt_tensor)
    outputs = torch.argmax(outputs, axis=1)   
    next_id = outputs[0][len(promt)+i-1]

    promt_ids[len(promt)+i] = next_id.item()
print(promt_ids)

[3, 7, 5, 12, 10, 4, 8]
