## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

corpus = [
    "karpathy is working in openai",
    "geoffrey hinton is from canada"    
]
data_size = len(corpus)

# 0: B-Person - 1: I-Person
# 2: B-Organization/Location - 3: I--Organization/Location 
# 4: O
labels = [[0, 4, 4, 4, 2],
          [0, 1, 4, 4, 2]]

# Define the max vocabulary size and sequence length
vocab_size = 11
sequence_length = 5
num_classes = 5 + 1

In [2]:
# Define tokenizer function
tokenizer = get_tokenizer('basic_english')

# Create a function to yield list of tokens
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# Create vocabulary
vocab = build_vocab_from_iterator(yield_tokens(corpus),
                                  max_tokens=vocab_size,
                                  specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
vocab.get_stoi()

{'karpathy': 8,
 'hinton': 6,
 '<unk>': 0,
 'geoffrey': 5,
 'in': 7,
 '<pad>': 1,
 'from': 4,
 'is': 2,
 'canada': 3,
 'openai': 9,
 'working': 10}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label):
    tokens = tokenizer(text)
    
    token_ids = [vocab[token] for token in tokens]    
    token_ids = token_ids + [vocab["<pad>"]] * (sequence_length - len(tokens))
    sequence_label = sequence_label + [5] * (sequence_length - len(tokens))
    
    return torch.tensor(token_ids, dtype=torch.long), torch.tensor(sequence_label, dtype=torch.long)

# Vectorize the samples
sentence_vecs = []
label_vecs = []
for sentence, labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
for v in sentence_vecs:
    print(v)

tensor([ 8,  2, 10,  7,  9])
tensor([5, 6, 2, 4, 3])


In [5]:
for v in label_vecs:
    print(v)

tensor([0, 4, 4, 4, 2])
tensor([0, 1, 4, 4, 2])


## Model

In [6]:
class POS_Model(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 4)
        self.recurrent = nn.RNN(4, num_classes, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        x,_ = self.recurrent(x)
        return x.permute(0, 2, 1)

# create model
model = POS_Model(vocab_size, num_classes)

# test
data = torch.tensor([[0, 4, 4, 4, 2]]).long()
output = model(data)
print(output.shape)

torch.Size([1, 6, 5])


## Train with full data

In [7]:
criterion = nn.CrossEntropyLoss(ignore_index=5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.2)

In [8]:
input_data = torch.tensor( [[ 8,  2, 10,  7,  9],
                            [5, 6, 2, 4, 3]], dtype=torch.long)
label_data = torch.tensor([[0, 4, 4, 4, 2],
                           [0, 1, 4, 4, 2]], dtype=torch.long)

for _ in range(25):
    optimizer.zero_grad()
    outputs = model(input_data)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

1.5540754795074463
0.992785632610321
0.8469835519790649
0.767238974571228
0.7097616195678711
0.6765373945236206
0.6551170349121094
0.6426442861557007
0.6313241720199585
0.6123499274253845
0.5849024653434753
0.565089762210846
0.5601094961166382
0.5592330694198608
0.5589781403541565
0.5588582158088684
0.5587877631187439
0.55874103307724
0.5587064623832703
0.5586787462234497
0.5586548447608948
0.5586336851119995
0.5586148500442505
0.5585978627204895
0.5585826635360718


In [9]:
outputs = model(input_data)
o_softmax = torch.softmax(outputs, axis=1)

print(o_softmax[0, :, 0].detach().numpy())
print(o_softmax[0, :, 1].detach().numpy())
print(o_softmax[0, :, 2].detach().numpy())
print(o_softmax[0, :, 3].detach().numpy())
print(o_softmax[0, :, 4].detach().numpy())

print()
print(o_softmax[1, :, 0].detach().numpy())
print(o_softmax[1, :, 1].detach().numpy())
print(o_softmax[1, :, 2].detach().numpy())
print(o_softmax[1, :, 3].detach().numpy())
print(o_softmax[1, :, 4].detach().numpy())

[0.5963922  0.08071621 0.08074266 0.08071624 0.08071621 0.08071639]
[0.08088248 0.0808358  0.08079487 0.08079284 0.5959011  0.08079284]
[0.08077803 0.08077841 0.08078013 0.08077817 0.59610724 0.080778  ]
[0.0807356  0.08072975 0.08072971 0.08073109 0.5963441  0.08072971]
[0.08071258 0.08071258 0.5963898  0.08075981 0.08071267 0.08071258]

[0.59641546 0.08071642 0.08071858 0.08071642 0.08071642 0.08071666]
[0.05325348 0.39349297 0.0532536  0.05325349 0.39349297 0.05325348]
[0.08071695 0.08071681 0.08071682 0.08071681 0.5964158  0.08071681]
[0.08071821 0.08076344 0.08071305 0.08071308 0.5963792  0.08071305]
[0.08071623 0.08071623 0.5964168  0.08071821 0.08071623 0.08071623]


In [10]:
# labels = [[0, 4, 4, 4, 2],
#           [0, 1, 4, 4, 2]]

o_softmax.argmax(axis=1)

tensor([[0, 4, 4, 4, 2],
        [0, 1, 4, 4, 2]])