## Data

In [10]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy training corpus of Vietnamese sentences, one string per sample.
# Only the first two sentences are active; the others are kept commented out.
corpus = [
    "Bà còng đi chợ trời mưa",
    "cái tôm cái tép đi đưa bà còng",
    # "Đưa bà tới quãng đường cong",
    # "Đưa bà vào tận ngõ trong nhà bà",
    # "Tiền bà trong túi rơi ra",
    # "Tép tôm nhặt được trả bà mua rau"
]
data_size = len(corpus)

# Cap on the vocabulary size, and the number of tokens in each training window
vocab_size, sequence_length = 10, 4

In [11]:
# Define tokenizer function.
# NOTE: 'basic_english' is an English-oriented normalizer. On this corpus the
# vocabulary printed below shows that tokens come out lowercased ("Bà" -> 'bà')
# and that each sentence is split into its whitespace-separated syllables.
tokenizer = get_tokenizer('basic_english')

# Lazily tokenize a collection of raw sentences
def yield_tokens(examples):
    """Yield the token list of each text in ``examples``, in order.

    Tokenization is done by the module-level ``tokenizer`` and happens lazily,
    one text at a time, as the generator is consumed.
    """
    yield from (tokenizer(text) for text in examples)

# Build the vocabulary from the tokenized corpus, keeping at most `vocab_size` entries
token_stream = yield_tokens(corpus)
vocab = build_vocab_from_iterator(token_stream, max_tokens=vocab_size)
vocab.get_stoi()

{'tôm': 8,
 'trời': 6,
 'tép': 7,
 'mưa': 5,
 'chợ': 4,
 'đi': 3,
 'còng': 2,
 'cái': 1,
 'đưa': 9,
 'bà': 0}

In [12]:
# Turn a raw sentence into the list of its vocabulary ids
def vectorize(text, vocab):
    """Tokenize ``text`` and map every token to its id in ``vocab``.

    Args:
        text: raw sentence to convert.
        vocab: mapping-like object indexed by token (``vocab[token]``).

    Returns:
        A list with one id per token, in sentence order.

    Note:
        Tokenization uses the module-level ``tokenizer``. What happens for a
        token missing from ``vocab`` depends on the vocab object itself.
    """
    return [vocab[token] for token in tokenizer(text)]

# Flatten the whole corpus into one continuous stream of token ids
corpus_ids = [
    token_id
    for sentence in corpus
    for token_id in vectorize(sentence, vocab)
]

In [13]:
# Show the corpus as a single flat sequence of token ids (all sentences concatenated)
corpus_ids

[0, 2, 3, 4, 6, 5, 1, 8, 1, 7, 3, 9, 0, 2]

In [14]:
# Slide a window of `sequence_length` tokens over the id stream. Each target is
# the input window shifted one token to the right (next-token prediction).
window_starts = range(len(corpus_ids) - sequence_length)
data_x = [corpus_ids[start:start + sequence_length] for start in window_starts]
data_y = [corpus_ids[start + 1:start + sequence_length + 1] for start in window_starts]

for x, y in zip(data_x, data_y):
    print("Input: ", x)
    print("Target: ", y)
    print("#"*10)

Input:  [0, 2, 3, 4]
Target:  [2, 3, 4, 6]
##########
Input:  [2, 3, 4, 6]
Target:  [3, 4, 6, 5]
##########
Input:  [3, 4, 6, 5]
Target:  [4, 6, 5, 1]
##########
Input:  [4, 6, 5, 1]
Target:  [6, 5, 1, 8]
##########
Input:  [6, 5, 1, 8]
Target:  [5, 1, 8, 1]
##########
Input:  [5, 1, 8, 1]
Target:  [1, 8, 1, 7]
##########
Input:  [1, 8, 1, 7]
Target:  [8, 1, 7, 3]
##########
Input:  [8, 1, 7, 3]
Target:  [1, 7, 3, 9]
##########
Input:  [1, 7, 3, 9]
Target:  [7, 3, 9, 0]
##########
Input:  [7, 3, 9, 0]
Target:  [3, 9, 0, 2]
##########


## Train with full data

In [15]:
class POS_Model(nn.Module):
    """Tiny Embedding -> RNN -> Linear network scoring the next token at every position.

    When the requested sizes match the hand-picked reference tables
    (``vocab_size=10``, ``emb_dim=2``, ``hidden_dim=3``) every parameter is
    initialised from those fixed values, which makes the forward pass fully
    deterministic. For any other sizes the tables cannot fit, so the layers
    keep PyTorch's default initialisation instead of being replaced by
    tensors of the wrong shape.

    Args:
        emb_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        vocab_size: number of token ids; also the number of output classes.
    """

    def __init__(self, emb_dim, hidden_dim, vocab_size):
        super().__init__()
        # Layers are registered in the order embedding -> rnn -> fc, so the
        # parameter order and the printed module repr stay the same.
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

        # Reference embedding table (10 tokens x 2 dims). Kept as a plain
        # attribute holding the *initial* values; it is copied into the
        # embedding layer rather than shared with it, so training the model
        # does not modify this tensor.
        self.custom_weights = torch.tensor([[-0.1,  0.5],
                                            [ 1.7, -0.8],
                                            [ 1.0, -1.9],
                                            [-1.3, -0.1],
                                            [ 0.2,  1.3],
                                            [ 0.4, -0.6],
                                            [ 0.5,  0.1],
                                            [ 0.4, -1.3],
                                            [ 0.2, -0.2],
                                            [ 0.1, -1.6],
                                            ], dtype=torch.float32)

        # (parameter, reference values) pairs for every trainable tensor.
        reference_values = (
            (self.embedding.weight, self.custom_weights),
            (self.rnn.bias_ih_l0, torch.tensor([0.4,  0.5, 0.2])),
            (self.rnn.bias_hh_l0, torch.tensor([ 0.1, -0.2, 0.1])),
            (self.rnn.weight_ih_l0, torch.tensor([[-0.4,  0.1],
                                                  [ 0.4, -0.4],
                                                  [ 0.1,  0.2]])),
            (self.rnn.weight_hh_l0, torch.tensor([[-0.5,  0.1, 0.1],
                                                  [-0.2, -0.2, 0.1],
                                                  [-0.2, -0.2, 0.2]])),
            (self.fc.weight, torch.tensor([[-0.52,  0.25, -0.35],
                                           [ 0.06, -0.16,  0.18],
                                           [-0.52,  0.33,  0.02],
                                           [-0.12, -0.57, -0.4],
                                           [-0.18, -0.5, -0.01],
                                           [ 0.35, -0.03,  0.46],
                                           [-0.13,  0.16,  0.23],
                                           [-0.99,  0.31, -0.24],
                                           [-0.74, -0.25,  0.5],
                                           [-0.44, -0.02, -0.39]])),
            (self.fc.bias, torch.tensor([0.12, -0.21, -0.15,
                                         -0.54,  0.36,  0.49,
                                         0.49,  0.07, 0.23,
                                         0.52])),
        )

        # All-or-nothing: load the tables only when every one of them fits the
        # layer it belongs to. A partial load would mix fixed and random
        # weights, and a forced load with the wrong sizes breaks forward().
        if all(param.shape == values.shape for param, values in reference_values):
            # In-place copy keeps the original Parameter objects, so the RNN's
            # internal weight list keeps pointing at the tensors it uses.
            with torch.no_grad():
                for param, values in reference_values:
                    param.copy_(values)

    def forward(self, x):
        """Return per-position logits over the vocabulary.

        Args:
            x: integer tensor of token ids, shaped (batch, seq_len).

        Returns:
            Tensor shaped (batch, vocab_size, seq_len). The class dimension is
            moved to dim 1, the layout ``nn.CrossEntropyLoss`` expects for
            sequence targets of shape (batch, seq_len).
        """
        embedded = self.embedding(x)
        outputs, _ = self.rnn(embedded)
        x = self.fc(outputs)
        return x.permute(0, 2, 1)

# Layer sizes for the toy model
emb_dim, hidden_dim = 2, 3

model = POS_Model(emb_dim=emb_dim, hidden_dim=hidden_dim, vocab_size=vocab_size)
print(model)

POS_Model(
  (embedding): Embedding(10, 2)
  (rnn): RNN(2, 3, batch_first=True)
  (fc): Linear(in_features=3, out_features=10, bias=True)
)


In [16]:
# Loss: CrossEntropyLoss expects the class dimension at dim 1, which matches
# the (batch, vocab, seq) layout produced by the model's forward().
criterion = nn.CrossEntropyLoss()
# Adam over all of the model's parameters, learning rate 0.05.
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [17]:
# One training example: the first window of the corpus and its shifted target
input_data = torch.tensor([[0, 2, 3, 4]], dtype=torch.long)
label_data = torch.tensor([[2, 3, 4, 6]], dtype=torch.long)

# Softmax over dim 1 (the vocabulary axis of the model output); used only to
# print probabilities. The loss itself is computed from the raw logits.
softmax = nn.Softmax(dim=1)

num_steps = 1
for _ in range(num_steps):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(input_data)
    softmax_outputs = softmax(outputs)
    print("softmax_outputs: ", softmax_outputs)

    loss = criterion(outputs, label_data)
    print(loss.item())

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

softmax_outputs:  tensor([[[0.0707, 0.1232, 0.0664, 0.0707],
         [0.0821, 0.0511, 0.0893, 0.0752],
         [0.0622, 0.1000, 0.0501, 0.0623],
         [0.0422, 0.0274, 0.0610, 0.0454],
         [0.1168, 0.0724, 0.1413, 0.1297],
         [0.2157, 0.1051, 0.2231, 0.1710],
         [0.1553, 0.1444, 0.1397, 0.1406],
         [0.0548, 0.1420, 0.0418, 0.0664],
         [0.0935, 0.0927, 0.0729, 0.1270],
         [0.1067, 0.1415, 0.1145, 0.1116]]], grad_fn=<SoftmaxBackward0>)
2.572925090789795


In [18]:
# Re-run the model after the update step and compare its predictions with the targets
outputs = model(input_data)
predicted_ids = outputs.argmax(dim=1)  # most likely token id at each position
print(outputs)
print(predicted_ids)
print(label_data)

tensor([[[-0.3289,  0.4112, -0.4012, -0.1740],
         [-0.2608, -0.4939, -0.1871, -0.2186],
         [-0.2838,  0.1343, -0.5939, -0.1417],
         [-0.8338, -0.7768, -0.3491, -0.7186],
         [ 0.1995, -0.0493,  0.5368,  0.3370],
         [ 0.7370,  0.2968,  0.6082,  0.6514],
         [ 0.5463,  0.5381,  0.3122,  0.6216],
         [-0.5983,  0.3866, -0.8628, -0.2612],
         [-0.1960, -0.2296, -0.3904,  0.1997],
         [ 0.0459,  0.5856,  0.1762,  0.2113]]], grad_fn=<PermuteBackward0>)
tensor([[5, 9, 5, 5]])
tensor([[2, 3, 4, 6]])
