## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "karpathy is working in openai",
    "geoffrey hinton is from canada"    
]
data_size = len(corpus)

# 0: B-Person - 1: I-Person
# 2: B-Organization/Location - 3: I--Organization/Location 
# 4: O
labels = [[0, 4, 4, 4, 2],
          [0, 1, 4, 4, 2]]

# Define the max vocabulary size and sequence length
vocab_size = 11
sequence_length = 5
num_classes = 5 + 1

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'karpathy': 8,
 'hinton': 6,
 '<unk>': 0,
 'geoffrey': 5,
 'in': 7,
 '<pad>': 1,
 'from': 4,
 'is': 2,
 'canada': 3,
 'openai': 9,
 'working': 10}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label):
    tokens = tokenizer(text)
    
    token_ids = [vocab[token] for token in tokens]    
    token_ids = token_ids + [vocab["<pad>"]] * (sequence_length - len(tokens))
    sequence_label = sequence_label + [5] * (sequence_length - len(tokens))
    
    return torch.tensor(token_ids, dtype=torch.long), torch.tensor(sequence_label, dtype=torch.long)

# Vectorize the samples
sentence_vecs = []
label_vecs = []
for sentence, labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
for v in sentence_vecs:
    print(v)

tensor([ 8,  2, 10,  7,  9])
tensor([5, 6, 2, 4, 3])


In [5]:
for v in label_vecs:
    print(v)

tensor([0, 4, 4, 4, 2])
tensor([0, 1, 4, 4, 2])


## Model

In [6]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=embed_dim, 
                                          num_heads=num_heads)
        self.ffn = nn.Linear(in_features=embed_dim, 
                             out_features=ff_dim)
        self.layernorm_1 = nn.LayerNorm(normalized_shape=embed_dim)
        self.layernorm_2 = nn.LayerNorm(normalized_shape=embed_dim)

    def forward(self, query, key, value):
        attn_output, _ = self.attn(query, key, value)
        out_1 = self.layernorm_1(query + attn_output)
        ffn_output = self.ffn(out_1)
        x = self.layernorm_2(out_1 + ffn_output)
        return x

In [7]:
class POS_Model(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, num_classes)
        self.transformer = TransformerBlock(num_classes, 1, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x, x, x)
        return x.permute(0, 2, 1)

# create model
model = POS_Model(vocab_size, num_classes)

# test
data = torch.tensor([[0, 4, 4, 4, 2]]).long()
output = model(data)
print(output.shape)

torch.Size([1, 6, 5])


## Train with full data

In [8]:
criterion = nn.CrossEntropyLoss(ignore_index=5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [9]:
input_data = torch.tensor( [[ 8,  2, 10,  7,  9],
                            [5, 6, 2, 4, 3]], dtype=torch.long)
label_data = torch.tensor([[0, 4, 4, 4, 2],
                           [0, 1, 4, 4, 2]], dtype=torch.long)

for _ in range(20):
    optimizer.zero_grad()
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

2.176647901535034
1.2579545974731445
0.8909069299697876
0.7292181849479675
0.5935591459274292
0.4841291010379791
0.35433346033096313
0.29415062069892883
0.20907063782215118
0.16005021333694458
0.13340218365192413
0.11409139633178711
0.09243372082710266
0.06807409226894379
0.04909799247980118
0.03793274611234665
0.031193658709526062
0.02675251103937626
0.024363311007618904
0.022772256284952164


In [10]:
outputs = model(input_data)
o_softmax = torch.softmax(outputs, axis=1)

print(o_softmax[0, :, 0].detach().numpy())
print(o_softmax[0, :, 1].detach().numpy())
print(o_softmax[0, :, 2].detach().numpy())
print(o_softmax[0, :, 3].detach().numpy())
print(o_softmax[0, :, 4].detach().numpy())

print()
print(o_softmax[1, :, 0].detach().numpy())
print(o_softmax[1, :, 1].detach().numpy())
print(o_softmax[1, :, 2].detach().numpy())
print(o_softmax[1, :, 3].detach().numpy())
print(o_softmax[1, :, 4].detach().numpy())

[0.9756817  0.00139394 0.00349527 0.00403198 0.01074486 0.0046522 ]
[8.2704140e-04 1.6058859e-03 4.0207077e-03 1.1784609e-03 9.9082458e-01
 1.5432964e-03]
[9.3371462e-04 1.2889557e-03 6.3935486e-03 1.3201460e-03 9.8821729e-01
 1.8462961e-03]
[1.4030256e-03 1.9074753e-03 8.1392471e-04 1.0871281e-03 9.9338347e-01
 1.4048947e-03]
[7.3550397e-04 1.5737988e-03 9.6949559e-01 1.8598555e-03 2.3983318e-02
 2.3520275e-03]

[0.9780271  0.00289837 0.00140658 0.00385914 0.00923854 0.00457024]
[0.01393217 0.94279796 0.00303008 0.0076766  0.02512229 0.00744104]
[8.2704221e-04 1.6058790e-03 4.0207324e-03 1.1784609e-03 9.9082458e-01
 1.5432979e-03]
[3.5270965e-03 1.0653657e-03 5.3581141e-04 1.1586939e-03 9.9221414e-01
 1.4988666e-03]
[1.2639592e-03 9.2231645e-04 9.8921204e-01 1.3695023e-03 5.3987256e-03
 1.8333482e-03]


In [11]:
# labels = [[0, 4, 4, 4, 2],
#           [0, 1, 4, 4, 2]]

o_softmax.argmax(axis=1)

tensor([[0, 4, 4, 4, 2],
        [0, 1, 4, 4, 2]])