## Data

In [1]:
import torch
import torch.nn as nn
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy corpus: one sentence per training sample.
corpus = ["i like a dog", "books are expensive"]
data_size = len(corpus)

# One tag id per token, aligned with the sentences in `corpus`:
#   0 = noun/pronoun, 1 = verb, 2 = anything else
labels = [
    [0, 1, 2, 0],
    [0, 1, 2],
]

# Upper bound on the vocabulary size (special tokens included) and the fixed
# length every sample is padded to.
vocab_size = 9
sequence_length = 4

In [2]:
# Tokenizer shared by the vocabulary builder and by `vectorize`
tokenizer = get_tokenizer("basic_english")


def yield_tokens(examples):
    """Lazily produce the token list of every text in `examples`.

    Each yielded item is whatever the notebook-level `tokenizer` returns for
    one text; nothing is tokenized until the generator is consumed.
    """
    yield from map(tokenizer, examples)

# Build the vocabulary from the corpus tokens. The special tokens are listed
# explicitly so that "<unk>" and "<pad>" are always present.
special_tokens = ["<unk>", "<pad>"]
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=special_tokens,
    max_tokens=vocab_size,
)

# Tokens that are not in the vocabulary resolve to the id of "<unk>".
vocab.set_default_index(vocab["<unk>"])

# Display the token -> id mapping.
vocab.get_stoi()

{'dog': 5,
 'books': 4,
 'like': 8,
 'i': 7,
 'are': 3,
 'a': 2,
 'expensive': 6,
 '<pad>': 1,
 '<unk>': 0}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label, pad_label=3):
    """Convert one sentence and its per-token tags into fixed-length tensors.

    Args:
        text: Raw sentence; it is split with the notebook-level `tokenizer`.
        vocab: Token -> id lookup (indexed as `vocab[token]`); must contain
            the "<pad>" token.
        sequence_length: Exact length of both returned tensors. Shorter
            sentences are padded, longer ones are truncated.
        sequence_label: One integer tag per token of `text`.
        pad_label: Tag written at padded positions. The default of 3 is the
            value this notebook passes as `ignore_index` to the loss.

    Returns:
        A `(token_ids, labels)` tuple of 1-D `torch.long` tensors, each of
        length `sequence_length`.

    Raises:
        ValueError: If the number of labels differs from the number of tokens,
            since the tags could then no longer be aligned with the tokens.
    """
    tokens = tokenizer(text)
    if len(sequence_label) != len(tokens):
        raise ValueError(
            f"expected {len(tokens)} labels for {text!r}, got {len(sequence_label)}"
        )

    # Truncate first: without this an over-long sentence would get a negative
    # pad count and come back longer than `sequence_length`.
    tokens = tokens[:sequence_length]
    tags = list(sequence_label[:sequence_length])
    pad_count = sequence_length - len(tokens)

    token_ids = [vocab[token] for token in tokens] + [vocab["<pad>"]] * pad_count
    tags = tags + [pad_label] * pad_count

    return (
        torch.tensor(token_ids, dtype=torch.long),
        torch.tensor(tags, dtype=torch.long),
    )

# Vectorize the samples
# NOTE: the loop variable must not be called `labels`; doing so would overwrite
# the module-level `labels` list and make this cell fail when it is re-run.
sentence_vecs = []
label_vecs = []
for sentence, sentence_labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, sentence_labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
# Show the padded token-id tensor built for each sentence.
for token_id_vec in sentence_vecs:
    print(token_id_vec)

tensor([7, 8, 2, 5])
tensor([4, 3, 6, 1])


In [5]:
# Show the padded tag tensor built for each sentence (3 marks a padded slot).
for tag_vec in label_vecs:
    print(tag_vec)

tensor([0, 1, 2, 0])
tensor([0, 1, 2, 3])


## Model

In [6]:
class POS_Model(nn.Module):
    """Per-token tagger: an embedding layer followed by a single 1-D convolution.

    The convolution uses `padding='same'`, so the output has one score vector
    per input position.
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=3, kernel_size=3):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            num_classes: Number of output channels, i.e. scores per position.
            embedding_dim: Width of each token embedding (default 3).
            kernel_size: Number of neighbouring positions the convolution
                looks at for each output position (default 3).
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1d = nn.Conv1d(embedding_dim, num_classes,
                                kernel_size=kernel_size,
                                padding='same')

    def forward(self, x):
        """Map token ids of shape (N, L) to class scores of shape (N, C, L)."""
        x = self.embedding(x)      # (N, L, E)
        x = x.permute(0, 2, 1)     # (N, E, L): Conv1d expects channels before length
        x = self.conv1d(x)         # (N, C, L)
        return x

# Fix the RNG so weight initialisation (and thus the training log below) is
# the same on every Restart & Run All.
torch.manual_seed(0)

num_classes = 4  # tag ids 0-2 plus the padding label 3
model = POS_Model(vocab_size, num_classes)

# Smoke test: one sentence of four token ids should give scores of shape
# (1, num_classes, 4). The variable is deliberately not called `input`, which
# would shadow the Python builtin for the rest of the session.
sample_input = torch.tensor([[7, 8, 2, 5]], dtype=torch.long)
output = model(sample_input)
print(output.shape)

torch.Size([1, 4, 4])


## Train with full data

In [7]:
# Label id that `vectorize` writes at padded positions; those positions must
# not contribute to the loss, so the same id is passed as `ignore_index`.
PAD_LABEL = 3
LEARNING_RATE = 0.1

criterion = nn.CrossEntropyLoss(ignore_index=PAD_LABEL)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [8]:
# Build the batch from the vectorized samples instead of re-typing the ids by
# hand, so the training data cannot drift from what `vectorize` produced.
input_data = torch.stack(sentence_vecs)  # (N, L) token ids
label_data = torch.stack(label_vecs)     # (N, L) tag ids, 3 at padded slots

num_epochs = 50
for _ in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(input_data)          # (N, C, L) class scores
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.3694839477539062
0.8195134401321411
0.4418118894100189
0.2108626365661621
0.09213379770517349
0.03880544751882553
0.016425946727395058
0.007170593831688166
0.0032705548219382763
0.001566659426316619
0.0007888965192250907
0.00041703274473547935
0.00023101909027900547
0.00013371474051382393
8.06455864221789e-05
5.059254908701405e-05
3.291735993116163e-05
2.215522908954881e-05
1.5343610357376747e-05
1.1001143320754636e-05
8.00395901023876e-06
6.0285306062723976e-06
4.632100171875209e-06
3.6103185721003683e-06
2.9461591566359857e-06
2.3841769234422827e-06
1.9754620552703273e-06
1.6689255062374286e-06
1.3964485106043867e-06
1.2602100696312846e-06
1.07288167328079e-06
9.196131713906652e-07
8.685236707606236e-07
7.663444989702839e-07
7.152549414968234e-07
6.641653271799441e-07
6.301056600932498e-07
5.449562650028383e-07
5.108964842293062e-07
4.93866650685959e-07
4.768367887209024e-07
4.598069267558458e-07
4.4277706479078915e-07
4.4277706479078915e-07
4.2574720282573253e-07
4.087173124389665e-07

In [10]:
outputs = model(input_data)
# outputs: (N, C, L); normalise over the class axis so that the scores of
# every position become a probability distribution.
o_softmax = torch.softmax(outputs, dim=1)

# Print one class-probability vector per token position, with a blank line
# between samples. Looping over the real shape avoids hard-coding a 2x4 batch.
probs = o_softmax.detach().numpy()
for sample_idx in range(probs.shape[0]):
    if sample_idx > 0:
        print()
    for position in range(probs.shape[2]):
        print(probs[sample_idx, :, position])

[9.9999988e-01 1.6069235e-07 5.2317120e-09 3.4500530e-08]
[3.25665739e-09 9.99999881e-01 1.92954253e-09 1.18086085e-07]
[8.8133534e-10 9.3091703e-09 9.9999976e-01 1.8393271e-07]
[9.9999845e-01 5.0635096e-10 1.4640083e-06 1.7258222e-07]

[9.9999988e-01 5.5934024e-08 1.0531438e-09 7.5882390e-08]
[4.418157e-09 9.999999e-01 4.356898e-10 8.937993e-08]
[4.1672257e-10 6.5373396e-09 1.0000000e+00 5.1108149e-08]
[9.9993849e-01 1.3678795e-09 7.2099656e-06 5.4301971e-05]


In [11]:
# Expected tag per position. `*` marks the padded slot: it is excluded from
# the loss, so whatever the model predicts there carries no meaning.
# [[0, 1, 2, 0],
#  [0, 1, 2, *]]

# Most probable class at every position -> tensor of shape (N, L).
torch.argmax(o_softmax, dim=1)

tensor([[0, 1, 2, 0],
        [0, 1, 2, 0]])