## Data

In [1]:
import torch
import torch.nn as nn
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy training corpus: two short Vietnamese sentences, one per entry.
corpus = ["ăn quả nhớ kẻ trồng cây", "có chí thì nên"]
data_size = len(corpus)

# Fixed number of token positions per sample; shorter samples are padded up
# to this length when they are vectorized.
sequence_length = 7

In [2]:
# torchtext's built-in 'basic_english' tokenizer, used to build the vocabulary
tokenizer = get_tokenizer('basic_english')


def yield_tokens(examples):
    """Lazily produce one token list per text in ``examples``."""
    yield from (tokenizer(text) for text in examples)

# Build the vocabulary from the tokenized corpus. The four special tokens are
# listed first and receive ids 0-3; tokens missing from the vocabulary fall
# back to the id of "<unk>".
vocab_size = 14
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>", "<sos>", "<eos>"],
    max_tokens=vocab_size,
)
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'ăn': 13,
 'nên': 9,
 'nhớ': 8,
 'trồng': 12,
 '<eos>': 3,
 'kẻ': 7,
 'thì': 11,
 'có': 6,
 'quả': 10,
 '<sos>': 2,
 'chí': 4,
 '<unk>': 0,
 'cây': 5,
 '<pad>': 1}

In [3]:
# Build next-token training pairs. Each sentence is wrapped in <sos>/<eos>
# markers; the input is the sequence without its last token and the target is
# the same sequence shifted left by one position.
# The markers are added to a local token list instead of being written back
# into `corpus`, so re-running this cell does not stack extra <sos>/<eos>
# tokens, and the loop handles any number of sentences.
data_x = []
data_y = []

for sentence in corpus:
    tokens = ['<sos>'] + sentence.split() + ['<eos>']
    data_x.append(tokens[:-1])
    data_y.append(tokens[1:])

print(data_x)
print(data_y)

[['<sos>', 'ăn', 'quả', 'nhớ', 'kẻ', 'trồng', 'cây'], ['<sos>', 'có', 'chí', 'thì', 'nên']]
[['ăn', 'quả', 'nhớ', 'kẻ', 'trồng', 'cây', '<eos>'], ['có', 'chí', 'thì', 'nên', '<eos>']]


In [4]:
# Numericalize a pair of token sequences and pad them to a fixed length
def vectorize(x, y, vocab, sequence_length):
    """Convert the token lists ``x`` and ``y`` into fixed-length id lists.

    Every token is looked up in ``vocab``; the result is cut to at most
    ``sequence_length`` ids and, when the token list is shorter than that,
    filled up on the right with the id of ``"<pad>"``.

    Returns:
        A tuple ``(x_ids, y_ids)`` of two lists of ids.
    """
    def encode(tokens):
        ids = [vocab[tok] for tok in tokens][:sequence_length]
        n_missing = sequence_length - len(tokens)
        # A non-positive count yields an empty list, so inputs that are
        # already long enough receive no padding.
        return ids + [vocab["<pad>"]] * n_missing

    return encode(x), encode(y)

# Vectorize every (input, target) pair, then collect inputs and targets
# into two parallel lists of id lists
vectorized_pairs = [
    vectorize(x, y, vocab, sequence_length) for x, y in zip(data_x, data_y)
]
data_x_ids = [x_ids for x_ids, _ in vectorized_pairs]
data_y_ids = [y_ids for _, y_ids in vectorized_pairs]

In [5]:
# Show each vectorized input row above its target row, then a blank line
for x, y in zip(data_x_ids, data_y_ids):
    print(x, y, sep="\n", end="\n\n")

[2, 13, 10, 8, 7, 12, 5]
[13, 10, 8, 7, 12, 5, 3]

[2, 6, 4, 11, 9, 1, 1]
[6, 4, 11, 9, 3, 1, 1]



In [6]:
# Convert the padded input id lists into a (num_samples, sequence_length)
# LongTensor. torch.as_tensor keeps this cell re-runnable: when `data_x_ids`
# is already a long tensor (second execution) it is returned as-is instead of
# being copy-constructed through torch.tensor, which emits a UserWarning.
data_x_ids = torch.as_tensor(data_x_ids, dtype=torch.long)
print(data_x_ids.shape)

torch.Size([2, 7])


In [7]:
# Convert the padded target id lists into a (num_samples, sequence_length)
# LongTensor. torch.as_tensor keeps this cell re-runnable: when `data_y_ids`
# is already a long tensor (second execution) it is returned as-is instead of
# being copy-constructed through torch.tensor, which emits a UserWarning.
data_y_ids = torch.as_tensor(data_y_ids, dtype=torch.long)
print(data_y_ids.shape)

torch.Size([2, 7])


## Train with full data

In [8]:
class TG_Model(nn.Module):
    """Next-token predictor: embedding -> RNN -> linear projection to the vocabulary."""

    def __init__(self, vocab_size, embed_dim):
        """Create the three layers.

        Args:
            vocab_size: number of token ids; sets the size of the embedding
                table and of the output logits.
            embed_dim: embedding width, also used as the RNN hidden size.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embed_dim)
        self.recurrent = nn.RNN(input_size=embed_dim,
                                hidden_size=embed_dim,
                                batch_first=True)
        self.linear = nn.Linear(in_features=embed_dim,
                                out_features=vocab_size)

    def forward(self, x):
        """Return per-position logits with the vocabulary axis moved to dim 1.

        Args:
            x: batch of token ids, shape (batch, seq_len).

        Returns:
            Logits of shape (batch, vocab_size, seq_len), the layout that
            nn.CrossEntropyLoss expects for sequence targets of shape
            (batch, seq_len).
        """
        embedded = self.embedding(x)
        # Only the per-step outputs are needed; the final hidden state is dropped.
        hidden_states, _last_hidden = self.recurrent(embedded)
        logits = self.linear(hidden_states)
        # (batch, seq_len, vocab_size) -> (batch, vocab_size, seq_len)
        return logits.permute(0, 2, 1)

# Seed PyTorch's RNG before the layers are created so that the random weight
# initialisation, and therefore the training run that follows, is repeatable
# across "Restart & Run All".
torch.manual_seed(0)

# 8 is the embedding width, which TG_Model also uses as the RNN hidden size.
model = TG_Model(vocab_size, 8)
print(model)

TG_Model(
  (embedding): Embedding(14, 8)
  (recurrent): RNN(8, 8, batch_first=True)
  (linear): Linear(in_features=8, out_features=14, bias=True)
)


In [9]:
# Smoke-test the forward pass on the full batch; the logits come back as
# (batch, vocab_size, sequence_length).
outputs = model(data_x_ids)
print(outputs.size())

torch.Size([2, 14, 7])


In [10]:
# Token-level cross-entropy over the class axis (dim 1 of the model output).
# No ignore_index is set, so <pad> target positions count toward the loss too.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.05)

In [11]:
# Full-batch training: 40 optimisation steps over the whole dataset. The
# printed value is the loss computed before that step's parameter update.
for _ in range(40):
    optimizer.zero_grad()
    outputs = model(data_x_ids)
    loss = criterion(outputs, data_y_ids)
    loss.backward()
    optimizer.step()
    print(loss.item())

2.6816394329071045
2.351379632949829
2.0741467475891113
1.8326606750488281
1.6047192811965942
1.3836381435394287
1.180101752281189
0.9993332624435425
0.8402460217475891
0.7022632360458374
0.5864279866218567
0.4909496307373047
0.4129471480846405
0.3500601649284363
0.3000575006008148
0.2608787715435028
0.2304748147726059
0.2068011462688446
0.18811772763729095
0.17314665019512177
0.16095659136772156
0.15092553198337555
0.14283910393714905
0.1365417242050171
0.1316073089838028
0.12765765190124512
0.12445992976427078
0.121831014752388
0.11961207538843155
0.11769706010818481
0.11604370921850204
0.11464127153158188
0.11346491426229477
0.11246101558208466
0.11157248914241791
0.11077065020799637
0.11005816608667374
0.1094428300857544
0.10891278088092804
0.10843981802463531


In [12]:
# Predicted token id at every position: argmax over the vocabulary axis (dim 1).
# Gradients are not needed for evaluation, so the forward pass runs under
# torch.no_grad() to avoid building an autograd graph.
with torch.no_grad():
    outputs = model(data_x_ids)
print(torch.argmax(outputs, dim=1))

tensor([[ 6, 10,  8,  7, 12,  5,  3],
        [ 6,  4, 11,  9,  3,  1,  1]])


In [13]:
# Ground-truth target ids, displayed for comparison with the predicted ids.
data_y_ids

tensor([[13, 10,  8,  7, 12,  5,  3],
        [ 6,  4, 11,  9,  3,  1,  1]])

## Inference

In [14]:
# Encode the prompt the same way as the training inputs: look up the ids,
# truncate to sequence_length, then right-pad with the <pad> id.
promt = '<sos> có'.split()
promt_ids = [vocab[token] for token in promt][:sequence_length]
promt_ids += [vocab["<pad>"]] * (sequence_length - len(promt))

print(promt_ids)

[2, 6, 1, 1, 1, 1, 1]


In [15]:
# Greedy autoregressive decoding: at each step the whole padded sequence is
# fed to the model and the prediction at the last filled position becomes the
# next token. The RNN is unidirectional, so the <pad> ids to the right of a
# position do not influence the prediction made at that position.
# Inference needs no gradients, so the loop runs under torch.no_grad().
# NOTE: every remaining slot is always filled; decoding does not stop once
# <eos> has been produced, so ids generated after <eos> carry no meaning.
with torch.no_grad():
    for i in range(sequence_length - len(promt)):
        promt_tensor = torch.tensor(promt_ids, dtype=torch.long).reshape(1, -1)
        outputs = model(promt_tensor)
        outputs = torch.argmax(outputs, dim=1)
        # Index len(promt)+i-1 is the last already-filled position; its
        # predicted id is written into the first still-padded slot.
        next_id = outputs[0][len(promt) + i - 1]

        promt_ids[len(promt) + i] = next_id.item()
        print(promt_ids)

[2, 6, 4, 1, 1, 1, 1]
[2, 6, 4, 11, 1, 1, 1]
[2, 6, 4, 11, 9, 1, 1]
[2, 6, 4, 11, 9, 3, 1]
[2, 6, 4, 11, 9, 3, 5]
