## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "i like cats and dogs",
    "books are expensive"    
]
data_size = len(corpus)

# 0: noun/pronoun - 1: verb - others - 2
labels = [[0, 1, 0, 2, 0],
          [0, 1, 2]]

# Define the max vocabulary size and sequence length
vocab_size = 10
sequence_length = 5

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'books': 4,
 '<unk>': 0,
 'expensive': 7,
 'dogs': 6,
 'and': 2,
 '<pad>': 1,
 'are': 3,
 'cats': 5,
 'i': 8,
 'like': 9}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label):
    tokens = tokenizer(text)
    
    token_ids = [vocab[token] for token in tokens]    
    token_ids = token_ids + [vocab["<pad>"]] * (sequence_length - len(tokens))
    sequence_label = sequence_label + [3] * (sequence_length - len(tokens))
    
    return torch.tensor(token_ids, dtype=torch.long), torch.tensor(sequence_label, dtype=torch.long)

# Vectorize the samples
sentence_vecs = []
label_vecs = []
for sentence, labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
for v in sentence_vecs:
    print(v)

tensor([8, 9, 5, 2, 6])
tensor([4, 3, 7, 1, 1])


In [5]:
for v in label_vecs:
    print(v)

tensor([0, 1, 0, 2, 0])
tensor([0, 1, 2, 3, 3])


## Model

In [6]:
class POS_Model(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 3)
        self.recurrent = nn.RNN(3, num_classes, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        x,_ = self.recurrent(x)
        return x.permute(0, 2, 1)

num_classes = 4
model = POS_Model(vocab_size, num_classes)
print(model)

# test
data = torch.tensor([[3, 2, 3, 2, 1]]).long()
output = model(data)
print(output.shape)

POS_Model(
  (embedding): Embedding(10, 3)
  (recurrent): RNN(3, 4, batch_first=True)
)
torch.Size([1, 4, 5])


## Train with full data

In [7]:
criterion = nn.CrossEntropyLoss(ignore_index=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.2)

In [8]:
input_data = torch.tensor( [[8, 9, 5, 2, 6],
                            [4, 3, 7, 1, 1]], dtype=torch.long)
label_data = torch.tensor([[0, 1, 0, 2, 0],
                           [0, 1, 2, 3, 3]], dtype=torch.long)

for _ in range(20):
    optimizer.zero_grad()
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.4273903369903564
1.0175745487213135
0.7693442702293396
0.6725095510482788
0.6197192072868347
0.5625689625740051
0.5095528364181519
0.47773879766464233
0.43436166644096375
0.3645094335079193
0.3482290506362915
0.34418296813964844
0.34248170256614685
0.3416934609413147
0.3413226902484894
0.341139554977417
0.3410395085811615
0.3409777879714966
0.3409355580806732
0.34090447425842285


In [9]:
outputs = model(input_data)
o_softmax = torch.softmax(outputs, axis=1)

print(o_softmax[0, :, 0].detach().numpy())
print(o_softmax[0, :, 1].detach().numpy())
print(o_softmax[0, :, 2].detach().numpy())
print(o_softmax[0, :, 3].detach().numpy())
print(o_softmax[0, :, 4].detach().numpy())

print()
print(o_softmax[1, :, 0].detach().numpy())
print(o_softmax[1, :, 1].detach().numpy())
print(o_softmax[1, :, 2].detach().numpy())
print(o_softmax[1, :, 3].detach().numpy())
print(o_softmax[1, :, 4].detach().numpy())

[0.7112322  0.09625781 0.09625485 0.09625515]
[0.09629406 0.7111547  0.09624638 0.09630489]
[0.71122783 0.09625426 0.09626369 0.09625424]
[0.0961752  0.09617013 0.7106047  0.09704998]
[0.7112343  0.09625539 0.09625524 0.09625509]

[0.7112344  0.09625528 0.09625518 0.09625514]
[0.09625517 0.7112312  0.09625825 0.09625545]
[0.09625809 0.0962548  0.7112322  0.09625488]
[0.71106    0.09647577 0.09623219 0.09623206]
[0.45365342 0.22276007 0.16194633 0.1616402 ]


In [10]:
# [[0, 1, 0, 2, 0],
#  [0, 1, 2, *, *]]

o_softmax.argmax(axis=1)

tensor([[0, 1, 0, 2, 0],
        [0, 1, 2, 0, 0]])