## Data

In [1]:
import torch
import torch.nn as nn
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy corpus of two short sentences
corpus = ["i like cats and dogs", "books are expensive"]
data_size = len(corpus)

# One tag id per word of each sentence:
#   0 = noun/pronoun, 1 = verb, 2 = anything else
labels = [
    [0, 1, 0, 2, 0],  # i / like / cats / and / dogs
    [0, 1, 2],        # books / are / expensive
]

# Maximum vocabulary size and the sequence length used for the samples
vocab_size, sequence_length = 10, 5

In [2]:
# Tokenizer obtained from torchtext's 'basic_english' preset
tokenizer = get_tokenizer('basic_english')


def yield_tokens(examples):
    """Lazily produce the token list of every text in ``examples``."""
    yield from map(tokenizer, examples)


# Build the vocabulary from the corpus, capped at `vocab_size` entries
# (the two special tokens are part of the vocabulary)
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)
# Tokens missing from the vocabulary are looked up as <unk>
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'dogs': 6,
 'books': 4,
 'like': 9,
 'i': 8,
 'cats': 5,
 'are': 3,
 'expensive': 7,
 'and': 2,
 '<pad>': 1,
 '<unk>': 0}

In [3]:
def vectorize(text, vocab, sequence_length, sequence_label, pad_label=3):
    """Turn a sentence and its per-token labels into fixed-length tensors.

    The text is split with the module-level ``tokenizer`` and each token is
    mapped to an id through ``vocab``. Ids and labels are then truncated or
    right-padded so that both results have exactly ``sequence_length`` entries.

    Args:
        text: Raw sentence to tokenize.
        vocab: Mapping that supports ``vocab[token]`` and ``vocab["<pad>"]``.
        sequence_length: Length of both returned tensors.
        sequence_label: One label id per token of ``text``; left unmodified.
        pad_label: Label id written at padded positions. Defaults to 3, the
            value the loss in this notebook is configured to ignore.

    Returns:
        A tuple ``(token_ids, labels)`` of 1-D ``torch.long`` tensors, each of
        length ``sequence_length``.

    Raises:
        ValueError: If the number of labels differs from the number of tokens,
            which would otherwise silently misalign tokens and labels.
    """
    tokens = tokenizer(text)
    if len(sequence_label) != len(tokens):
        raise ValueError(
            f"expected {len(tokens)} labels for {text!r}, got {len(sequence_label)}"
        )

    # Truncate first so an over-long sentence cannot exceed sequence_length
    token_ids = [vocab[token] for token in tokens][:sequence_length]
    label_ids = list(sequence_label)[:sequence_length]

    # Both lists have the same length here, so one padding count serves both
    num_padding = sequence_length - len(token_ids)
    token_ids += [vocab["<pad>"]] * num_padding
    label_ids += [pad_label] * num_padding

    return (torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(label_ids, dtype=torch.long))

# Vectorize every (sentence, labels) pair.
# The loop variable is deliberately not called `labels`: rebinding that global
# would leave it pointing at the last sentence's labels, so re-running this
# cell would zip the corpus against a flat list of ints and fail.
sentence_vecs = []
label_vecs = []
for sentence, sentence_labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, sentence_labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
# Show the token-id tensor built for each sentence
for token_id_vec in sentence_vecs:
    print(token_id_vec)

tensor([8, 9, 5, 2, 6])
tensor([4, 3, 7, 1, 1])


In [5]:
# Show the label tensor built for each sentence
for label_id_vec in label_vecs:
    print(label_id_vec)

tensor([0, 1, 0, 2, 0])
tensor([0, 1, 2, 3, 3])


## Model

In [6]:
class POS_Model(nn.Module):
    """Per-token classifier: an embedding layer followed by one 1-D convolution.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of output channels, i.e. scores produced per token.
        embedding_dim: Size of each token embedding (default 3).
        kernel_size: Width of the convolution window over tokens (default 3).
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=3, kernel_size=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # padding='same' keeps the output length equal to the input length,
        # so every input token receives its own score vector
        self.conv1d = nn.Conv1d(embedding_dim, num_classes,
                                kernel_size=kernel_size,
                                padding='same')

    def forward(self, x):
        """Score every token of a batch of id sequences.

        Args:
            x: Integer tensor of token ids with shape (N, L).

        Returns:
            Tensor of unnormalised class scores with shape (N, num_classes, L).
        """
        x = self.embedding(x)       # (N, L, embedding_dim)
        x = x.permute(0, 2, 1)      # Conv1d expects channels first: (N, embedding_dim, L)
        x = self.conv1d(x)          # (N, num_classes, L)
        return x

# Three tags (0, 1, 2) plus label 3, which marks padded positions
num_classes = 4

# Seed the RNG so the random weight initialisation, and therefore the
# training run that follows, is reproducible after a kernel restart
torch.manual_seed(0)
model = POS_Model(vocab_size, num_classes)

# Smoke test: one sequence of 5 token ids must yield scores of shape (1, num_classes, 5)
data = torch.tensor([[3, 2, 3, 2, 1]]).long()
output = model(data)
assert output.shape == (1, num_classes, 5)
print(output.shape)

torch.Size([1, 4, 5])


## Train with full data

In [7]:
# Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.1)
# Targets equal to 3 (the label given to padded positions) do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=3)

In [8]:
# Batch the tensors produced by the vectorization step instead of re-typing
# their values by hand, so the training data always follows `corpus`/`labels`
input_data = torch.stack(sentence_vecs)   # (N, L) token ids
label_data = torch.stack(label_vecs)      # (N, L) label ids; 3 marks padding

num_epochs = 30
for _ in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(input_data)           # (N, C, L) class scores
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.6360236406326294
1.2038403749465942
0.9081104397773743
0.6820070743560791
0.49254974722862244
0.3340511918067932
0.21083757281303406
0.12564197182655334
0.07316330820322037
0.04241782799363136
0.02442837506532669
0.013938802294433117
0.007947552017867565
0.004590289667248726
0.0027156055439263582
0.0016562764067202806
0.0010442477650940418
0.0006807268364354968
0.0004583912086673081
0.00031833001412451267
0.00022759025159757584
0.0001671545469434932
0.00012588761455845088
9.707371646072716e-05
7.640858530066907e-05
6.141961785033345e-05
5.022981349611655e-05
4.181126496405341e-05
3.528495290083811e-05
3.0278421036200598e-05


In [9]:
# Class probabilities for every token of the training batch.
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(input_data)                 # (N, C, L)
    o_softmax = torch.softmax(outputs, dim=1)   # normalise over the class axis

# One line of class probabilities per token, with a blank line between sentences.
# The loop bounds come from the tensor itself, so any batch size or length works.
for sentence_idx in range(o_softmax.shape[0]):
    if sentence_idx > 0:
        print()
    for position in range(o_softmax.shape[2]):
        print(o_softmax[sentence_idx, :, position].numpy())

[9.9994481e-01 5.2686952e-05 3.6694772e-07 2.1535823e-06]
[7.4360424e-09 9.9994612e-01 4.7608472e-05 6.2817148e-06]
[1.0000000e+00 4.7898517e-08 8.1678254e-11 4.4654758e-08]
[2.1330639e-08 1.0156863e-08 9.9996078e-01 3.9259608e-05]
[9.9999893e-01 8.8653377e-07 2.8536473e-08 2.7780936e-07]

[9.999918e-01 2.462372e-06 4.567031e-07 5.243405e-06]
[1.0149118e-05 9.9997079e-01 2.5749755e-06 1.6418842e-05]
[1.2613095e-06 3.8082185e-06 9.9997675e-01 1.8175289e-05]
[9.7893411e-01 1.3944202e-05 1.5013643e-03 1.9550666e-02]
[9.98030961e-01 1.86652597e-03 1.09709913e-06 1.01510486e-04]


In [10]:
# Most probable class per token. Gold labels for comparison:
#   [[0, 1, 0, 2, 0],
#    [0, 1, 2, *, *]]   (* = padded position)
torch.argmax(o_softmax, dim=1)

tensor([[0, 1, 0, 2, 0],
        [0, 1, 2, 0, 0]])