## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy corpus: two short sentences make up the whole dataset.
corpus = ["i like cats and dogs", "books are expensive"]
data_size = len(corpus)

# Per-token tags, one inner list per sentence:
#   0 = noun/pronoun, 1 = verb, 2 = anything else
labels = [
    [0, 1, 0, 2, 0],
    [0, 1, 2],
]

# Maximum vocabulary size and the fixed (padded) sequence length
vocab_size, sequence_length = 10, 5

In [2]:
# Define tokenizer function: torchtext's built-in 'basic_english' tokenizer.
# It is a module-level name that both yield_tokens() and vectorize() call.
tokenizer = get_tokenizer('basic_english')

# Generator that feeds tokenized sentences to the vocabulary builder
def yield_tokens(examples):
    """Yield the token list of each text in `examples`, in order.

    Tokenization is delegated to the module-level `tokenizer`.
    """
    yield from map(tokenizer, examples)

# Build the vocabulary from the tokenized corpus, reserving "<unk>" and
# "<pad>" as special tokens and capping the total size at `vocab_size`
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)
# Tokens that are not in the vocabulary resolve to the "<unk>" index
vocab.set_default_index(vocab["<unk>"])
# Display the token -> index mapping as the cell output
vocab.get_stoi()

{'books': 4,
 '<unk>': 0,
 'expensive': 7,
 'dogs': 6,
 'and': 2,
 '<pad>': 1,
 'are': 3,
 'cats': 5,
 'i': 8,
 'like': 9}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label, pad_label=3):
    """Convert one sentence and its per-token labels into fixed-length tensors.

    Args:
        text: Raw sentence; it is split with the module-level `tokenizer`.
        vocab: Token-to-id mapping; must support `vocab[token]` and contain
            the "<pad>" token.
        sequence_length: Exact length of both returned tensors. Shorter
            inputs are padded on the right, longer inputs are truncated.
        sequence_label: One integer label per token of `text`.
        pad_label: Label written at padded positions. Defaults to 3, the
            value the loss is configured to ignore (`ignore_index=3`).

    Returns:
        A `(token_ids, labels)` pair of 1-D `torch.long` tensors, each of
        length `sequence_length`.

    Raises:
        ValueError: If the number of labels differs from the number of
            tokens, since the two sequences would be misaligned.
    """
    tokens = tokenizer(text)
    if len(sequence_label) != len(tokens):
        raise ValueError(
            f"expected {len(tokens)} labels for {text!r}, got {len(sequence_label)}"
        )

    # Truncate first so the outputs never exceed `sequence_length`; without
    # this, over-long sentences would yield tensors that cannot be batched.
    tokens = tokens[:sequence_length]
    padded_labels = list(sequence_label[:sequence_length])

    num_padding = sequence_length - len(tokens)
    token_ids = [vocab[token] for token in tokens] + [vocab["<pad>"]] * num_padding
    padded_labels = padded_labels + [pad_label] * num_padding

    return (
        torch.tensor(token_ids, dtype=torch.long),
        torch.tensor(padded_labels, dtype=torch.long),
    )

# Vectorize the samples
# The loop variable is deliberately NOT named `labels`: reusing that name
# would overwrite the global `labels` list with its last element and make
# this cell fail (or silently misbehave) when it is run a second time.
sentence_vecs = []
label_vecs = []
for sentence, sentence_labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, sentence_labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
# Show the padded token-id tensor of each sentence, one per line
for v in sentence_vecs:
    print(v)

tensor([8, 9, 5, 2, 6])
tensor([4, 3, 7, 1, 1])


In [5]:
# Show the padded label tensor of each sentence; 3 marks padded positions
for v in label_vecs:
    print(v)

tensor([0, 1, 0, 2, 0])
tensor([0, 1, 2, 3, 3])


## Model

In [6]:
class POS_Model(nn.Module):
    """Per-token tagger: an embedding lookup followed by one 1-D convolution.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of output channels, i.e. one logit per tag.
        embedding_dim: Size of each token embedding (default 3).
        kernel_size: Width of the convolution window in tokens (default 2).
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=3, kernel_size=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # padding='same' keeps the output length equal to the input length,
        # so the model emits exactly one prediction per input token.
        self.conv1d = nn.Conv1d(embedding_dim, num_classes,
                                kernel_size=kernel_size, padding='same')

    def forward(self, x):
        """Map token ids of shape (N, L) to logits of shape (N, num_classes, L)."""
        x = self.embedding(x)      # (N, L, embedding_dim)
        x = x.permute(0, 2, 1)     # (N, embedding_dim, L): Conv1d wants channels first
        x = self.conv1d(x)         # (N, num_classes, L)
        return x

# Seed the RNG before the model is built so that the randomly initialised
# weights, and therefore the losses printed during training, are the same
# on every Restart & Run All.
torch.manual_seed(0)

num_classes = 4  # tags 0, 1, 2 plus the padding label 3
model = POS_Model(vocab_size, num_classes)
print(model)

# test: one sequence of 5 token ids should give logits of shape (1, num_classes, 5)
data = torch.tensor([[3, 2, 3, 2, 1]]).long()
output = model(data)
print(output.shape)

POS_Model(
  (embedding): Embedding(10, 3)
  (conv1d): Conv1d(3, 4, kernel_size=(2,), stride=(1,), padding=same)
)
torch.Size([1, 4, 5])


  return F.conv1d(input, weight, bias, self.stride,


## Train with full data

In [7]:
# Label 3 is the padding label appended by vectorize(); ignore_index=3
# excludes those positions from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=3)
# Adam over all model parameters with learning rate 0.1
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [8]:
# Batch the tensors produced by vectorize() instead of re-typing their values
# by hand, so the training data cannot drift from the corpus defined above.
input_data = torch.stack(sentence_vecs)  # (N, L) token ids
label_data = torch.stack(label_vecs)     # (N, L) tags, 3 = padding

# Full-batch training: every step uses the whole dataset
for _ in range(15):
    optimizer.zero_grad()
    outputs = model(input_data)            # (N, C, L) logits
    loss = criterion(outputs, label_data)  # padded positions are ignored
    print(loss.item())
    loss.backward()
    optimizer.step()

1.4303059577941895
1.0821889638900757
0.8254757523536682
0.603065013885498
0.4072841703891754
0.2514585256576538
0.143727645277977
0.07849040627479553
0.04217911511659622
0.02268858626484871
0.012367569841444492
0.006913817021995783
0.004004978109151125
0.0024216610472649336
0.0015347135486081243


In [9]:
outputs = model(input_data)
# outputs: (N, C, L)
o_softmax = torch.softmax(outputs, dim=1)  # normalise over the class axis

# Print the class probabilities of every token, sentence by sentence, with a
# blank line between sentences. Looping over the tensor's own shape replaces
# the hard-coded indices, so this keeps working if the data changes size.
for sample_idx in range(o_softmax.shape[0]):
    if sample_idx > 0:
        print()
    for token_idx in range(o_softmax.shape[2]):
        print(o_softmax[sample_idx, :, token_idx].detach().numpy())

[9.9989522e-01 4.6947275e-06 6.1434424e-05 3.8712671e-05]
[2.3952198e-05 9.9855107e-01 1.3405434e-03 8.4346801e-05]
[9.9796885e-01 2.1413396e-04 1.2553470e-04 1.6914604e-03]
[3.0274030e-06 8.5475628e-04 9.9914038e-01 1.7631862e-06]
[9.97780263e-01 2.19874107e-03 1.06513335e-05 1.03553875e-05]

[9.9999917e-01 6.1268611e-07 2.4414143e-07 2.9376526e-08]
[9.8818717e-05 9.9870360e-01 4.2762927e-06 1.1933001e-03]
[3.2247015e-05 1.1721413e-04 9.9980146e-01 4.9104656e-05]
[8.7581277e-01 9.6738786e-03 1.1450429e-01 9.0944468e-06]
[9.9461752e-01 5.2106599e-03 9.1189089e-05 8.0600825e-05]


In [10]:
# Expected labels (3 = padding, which the loss ignores):
# [[0, 1, 0, 2, 0],
#  [0, 1, 2, 3, 3]]
# Predicted tag per token = argmax over the class axis (axis 1). Padded
# positions were never trained on, so whatever is predicted there is arbitrary.

o_softmax.argmax(axis=1)

tensor([[0, 1, 0, 2, 0],
        [0, 1, 2, 0, 0]])