## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "karpathy is working in openai",
    "geoffrey hinton is from canada"    
]
data_size = len(corpus)

# 0: B-Person - 1: I-Person
# 2: B-Organization/Location - 3: I--Organization/Location 
# 4: O
labels = [[0, 4, 4, 4, 2],
          [0, 1, 4, 4, 2]]

# Define the max vocabulary size and sequence length
vocab_size = 11
sequence_length = 5
num_classes = 5 + 1

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'karpathy': 8,
 'hinton': 6,
 '<unk>': 0,
 'geoffrey': 5,
 'in': 7,
 '<pad>': 1,
 'from': 4,
 'is': 2,
 'canada': 3,
 'openai': 9,
 'working': 10}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label):
    tokens = tokenizer(text)
    
    token_ids = [vocab[token] for token in tokens]    
    token_ids = token_ids + [vocab["<pad>"]] * (sequence_length - len(tokens))
    sequence_label = sequence_label + [5] * (sequence_length - len(tokens))
    
    return torch.tensor(token_ids, dtype=torch.long), torch.tensor(sequence_label, dtype=torch.long)

# Vectorize the samples
sentence_vecs = []
label_vecs = []
for sentence, labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
for v in sentence_vecs:
    print(v)

tensor([ 8,  2, 10,  7,  9])
tensor([5, 6, 2, 4, 3])


In [5]:
for v in label_vecs:
    print(v)

tensor([0, 4, 4, 4, 2])
tensor([0, 1, 4, 4, 2])


## Model

In [6]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, 
                                          num_heads=num_heads)
        self.ffn = nn.Linear(in_features=embed_dim, 
                             out_features=ff_dim)
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, query, key, value):
        attn_output, _ = self.attn(query, key, value)
        out_1 = self.layernorm_1(query + attn_output)
        ffn_output = self.ffn(out_1)
        x = self.layernorm_2(out_1 + ffn_output)
        return x

In [7]:
class POS_Model(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 4)
        self.transformer = TransformerBlock(4, 1, 4)
        self.fc = nn.Linear(4, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x, x)
        x = self.fc(x)
        return x.permute(0, 2, 1)

# create model
model = POS_Model(vocab_size, num_classes)

# test
data = torch.tensor([[0, 4, 4, 4, 2]]).long()
output = model(data)
print(output.shape)

torch.Size([1, 6, 5])


## Train with full data

In [8]:
criterion = nn.CrossEntropyLoss(ignore_index=5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.2)

In [9]:
input_data = torch.tensor( [[ 8,  2, 10,  7,  9],
                            [5, 6, 2, 4, 3]], dtype=torch.long)
label_data = torch.tensor([[0, 4, 4, 4, 2],
                           [0, 1, 4, 4, 2]], dtype=torch.long)

for _ in range(20):
    optimizer.zero_grad()
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.6467418670654297
1.3323087692260742
1.2814031839370728
1.1359691619873047
0.8551193475723267
0.7378385663032532
0.808885395526886
0.6745971441268921
0.7959946393966675
0.6109843850135803
0.5051301717758179
0.4285423159599304
0.3084316849708557
0.25852131843566895
0.24296411871910095
0.2354608029127121
0.14161622524261475
0.06275929510593414
0.02444736286997795
0.02680407091975212


In [10]:
outputs = model(input_data)
o_softmax = torch.softmax(outputs, axis=1)

print(o_softmax[0, :, 0].detach().numpy())
print(o_softmax[0, :, 1].detach().numpy())
print(o_softmax[0, :, 2].detach().numpy())
print(o_softmax[0, :, 3].detach().numpy())
print(o_softmax[0, :, 4].detach().numpy())

print()
print(o_softmax[1, :, 0].detach().numpy())
print(o_softmax[1, :, 1].detach().numpy())
print(o_softmax[1, :, 2].detach().numpy())
print(o_softmax[1, :, 3].detach().numpy())
print(o_softmax[1, :, 4].detach().numpy())

[8.4492511e-01 1.5470576e-01 1.2180508e-05 4.0215923e-06 3.4746790e-04
 5.4186257e-06]
[1.0262575e-08 2.7112910e-04 5.2790966e-12 3.0160552e-05 9.9966812e-01
 3.0567833e-05]
[1.5038705e-08 1.0977791e-03 6.9733867e-13 1.8743758e-05 9.9886334e-01
 2.0135329e-05]
[1.3520665e-08 8.8443304e-04 8.8877072e-13 2.0139692e-05 9.9907398e-01
 2.1438202e-05]
[1.2150423e-04 5.5672389e-10 9.9987853e-01 4.1220623e-08 6.6555579e-09
 3.4894388e-08]

[8.2452250e-01 1.7511739e-01 1.2364363e-05 5.0626413e-06 3.3582351e-04
 6.7838596e-06]
[6.03546202e-02 9.34617698e-01 1.14734995e-07 2.28721401e-05
 4.97437222e-03 3.04535642e-05]
[1.1804904e-08 6.5698958e-04 1.1678058e-12 2.0642156e-05 9.9930060e-01
 2.1760210e-05]
[1.1442326e-08 5.8199500e-04 1.4578233e-12 2.2444552e-05 9.9937207e-01
 2.3503861e-05]
[1.2278516e-04 8.1535029e-10 9.9987710e-01 6.9433149e-08 1.6996479e-08
 5.8558189e-08]


In [11]:
# labels = [[0, 4, 4, 4, 2],
#           [0, 1, 4, 4, 2]]

o_softmax.argmax(axis=1)

tensor([[0, 4, 4, 4, 2],
        [0, 1, 4, 4, 2]])