## Data

In [1]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Toy corpus: two sentences of five tokens each.
corpus = [
    "karpathy is working in openai",
    "geoffrey hinton is from canada",
]
data_size = len(corpus)

# Per-token tag ids, one inner list per sentence:
#   0: B-Person                  1: I-Person
#   2: B-Organization/Location   3: I-Organization/Location
#   4: O
labels = [
    [0, 4, 4, 4, 2],
    [0, 1, 4, 4, 2],
]

# Cap on the vocabulary size (special tokens count towards it) and the fixed
# length every sentence is padded to.
vocab_size = 11
sequence_length = 5
# Five tag ids (0-4) plus one extra id (5) that is used as the padding label.
num_classes = 5 + 1

In [2]:
# Sentence -> list-of-tokens callable, using torchtext's "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(examples):
    """Lazily yield the token list of every sentence in ``examples``.

    Each sentence is split with the module-level ``tokenizer``; the result is
    a generator, so nothing is tokenized until it is iterated.
    """
    yield from (tokenizer(sentence) for sentence in examples)

# Build the vocabulary from the tokenized corpus, capped at `vocab_size`
# entries; the two special tokens end up with ids 0 and 1 (see output below).
vocab = build_vocab_from_iterator(
    yield_tokens(corpus),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)
# Tokens that are not in the vocabulary map to the <unk> id.
vocab.set_default_index(vocab["<unk>"])
# Display the token -> id mapping.
vocab.get_stoi()

{'karpathy': 8,
 'hinton': 6,
 '<unk>': 0,
 'geoffrey': 5,
 'in': 7,
 '<pad>': 1,
 'from': 4,
 'is': 2,
 'canada': 3,
 'openai': 9,
 'working': 10}

In [3]:
# Tokenize and numericalize your samples
def vectorize(text, vocab, sequence_length, sequence_label, pad_label=5):
    """Convert one sentence and its per-token tags into fixed-length tensors.

    Args:
        text: Raw sentence; it is split with the module-level ``tokenizer``.
        vocab: Token -> id mapping supporting ``vocab[token]``; it must
            contain the ``"<pad>"`` token.
        sequence_length: Length of both returned tensors. Shorter sentences
            are right-padded, longer ones are truncated.
        sequence_label: One tag id per token of ``text``.
        pad_label: Tag id written at padded positions. The default (5) is the
            value this notebook passes as ``ignore_index`` to the loss.

    Returns:
        A ``(token_ids, labels)`` pair of 1-D ``torch.long`` tensors, each of
        length ``sequence_length``.

    Raises:
        ValueError: If the number of labels differs from the number of tokens.
    """
    tokens = tokenizer(text)
    if len(sequence_label) != len(tokens):
        raise ValueError(
            f"got {len(sequence_label)} labels for {len(tokens)} tokens in {text!r}"
        )

    # Truncate first: a negative pad count would otherwise silently add no
    # padding and leave the tensors longer than `sequence_length`.
    tokens = tokens[:sequence_length]
    tag_ids = list(sequence_label[:sequence_length])
    pad_count = sequence_length - len(tokens)

    token_ids = [vocab[token] for token in tokens] + [vocab["<pad>"]] * pad_count
    tag_ids = tag_ids + [pad_label] * pad_count

    return (
        torch.tensor(token_ids, dtype=torch.long),
        torch.tensor(tag_ids, dtype=torch.long),
    )

# Vectorize the samples
sentence_vecs = []
label_vecs = []
# The loop variable must not be called `labels`: that would overwrite the
# global `labels` list, so re-running this cell would fail and later cells
# would only see the last sentence's tags.
for sentence, sentence_labels in zip(corpus, labels):
    sentence_vec, labels_vec = vectorize(sentence, vocab, sequence_length, sentence_labels)
    sentence_vecs.append(sentence_vec)
    label_vecs.append(labels_vec)

In [4]:
# Show the token-id tensor of every sentence, one per line.
for token_id_vec in sentence_vecs:
    print(token_id_vec)

tensor([ 8,  2, 10,  7,  9])
tensor([5, 6, 2, 4, 3])


In [5]:
# Show the tag-id tensor of every sentence, one per line.
for tag_id_vec in label_vecs:
    print(tag_id_vec)

tensor([0, 4, 4, 4, 2])
tensor([0, 1, 4, 4, 2])


## Model

In [6]:
class POS_Model(nn.Module):
    """Per-token tagger: embedding -> single-layer RNN -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of output classes predicted for every token.
        embedding_dim: Width of the token embeddings (default 4).
        hidden_dim: Width of the RNN hidden state (default 3).
        padding_idx: Optional token id whose embedding is kept at zero and
            receives no gradient; ``None`` (default) treats every id alike.
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=4, hidden_dim=3,
                 padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.recurrent = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class scores for every token.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Unnormalised scores of shape ``(batch, num_classes, seq_len)``.
        """
        embedded = self.embedding(x)                 # (batch, seq_len, embedding_dim)
        hidden_states, _ = self.recurrent(embedded)  # (batch, seq_len, hidden_dim)
        logits = self.fc(hidden_states)              # (batch, seq_len, num_classes)
        # nn.CrossEntropyLoss wants the class dimension second for sequence
        # targets, so move it in front of the sequence dimension.
        return logits.permute(0, 2, 1)

# Seed PyTorch's RNG so the randomly initialised weights, and with them the
# training run further down, are the same on every Restart & Run All.
torch.manual_seed(0)

# create model
model = POS_Model(vocab_size, num_classes)

# Smoke test: push one dummy sequence of five token ids through the model and
# check the output shape, which is (batch, num_classes, sequence_length).
data = torch.tensor([[0, 4, 4, 4, 2]]).long()
output = model(data)
print(output.shape)

torch.Size([1, 6, 5])


## Train with full data

In [7]:
# Adam over all model parameters with a step size of 0.1.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.1)
# Cross-entropy over the tag classes; targets equal to 5 (the padding label)
# are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=5)

In [8]:
# Build the batch from the tensors produced by the vectorization step instead
# of hand-copying their printed values, so the batch can never drift out of
# sync with the corpus or the vocabulary.
input_data = torch.stack(sentence_vecs)  # (data_size, sequence_length)
label_data = torch.stack(label_vecs)     # (data_size, sequence_length)

num_epochs = 20
for _ in range(num_epochs):
    optimizer.zero_grad()                  # clear gradients from the previous step
    outputs = model(input_data)            # (data_size, num_classes, sequence_length)
    loss = criterion(outputs, label_data)
    print(loss.item())
    loss.backward()
    optimizer.step()

2.1850037574768066
1.7266919612884521
1.4521758556365967
1.245767593383789
1.0752394199371338
0.9399515986442566
0.8289477229118347
0.7364404797554016
0.6654247641563416
0.6092787981033325
0.5596674084663391
0.5070559978485107
0.44396862387657166
0.3876616954803467
0.34720128774642944
0.30704933404922485
0.262486070394516
0.216573566198349
0.17488183081150055
0.140676349401474


In [9]:
outputs = model(input_data)
# Normalise the scores over the class dimension (dim 1) so that every token
# gets a probability distribution over the tags.
o_softmax = torch.softmax(outputs, dim=1)

# Print one probability vector per token, with a blank line between
# sentences. Iterating over the tensor's own shape keeps this correct for
# any batch size or sequence length.
probabilities = o_softmax.detach().numpy()  # (batch, num_classes, seq_len)
for sample_idx, sample_probs in enumerate(probabilities):
    if sample_idx > 0:
        print()
    for position in range(sample_probs.shape[1]):
        print(sample_probs[:, position])

[0.86519384 0.00586162 0.09146995 0.01885612 0.0031571  0.01546139]
[7.3300733e-04 9.3245566e-02 1.2100055e-03 2.5279010e-03 9.0151083e-01
 7.7270193e-04]
[2.9968494e-03 1.1367687e-02 1.3514674e-02 8.8797894e-04 9.7103012e-01
 2.0271035e-04]
[4.3400344e-03 5.7571558e-03 2.6592005e-02 6.0904777e-04 9.6257532e-01
 1.2650219e-04]
[7.6904841e-02 3.1600593e-04 8.3876544e-01 1.2539926e-03 8.2295768e-02
 4.6403991e-04]

[0.8649206  0.00361764 0.10484723 0.01342196 0.00263019 0.01056243]
[0.08812151 0.75534433 0.0029293  0.04782962 0.07898337 0.02679189]
[2.2203813e-03 1.7921329e-02 8.1065921e-03 1.1076387e-03 9.7037596e-01
 2.6817527e-04]
[2.3288252e-03 1.7419130e-02 8.6356374e-03 1.1145559e-03 9.7023195e-01
 2.6989699e-04]
[7.3620141e-02 3.0539682e-04 8.4073490e-01 1.2185416e-03 8.3671786e-02
 4.4919652e-04]


In [10]:
# Ground-truth tags, for comparison with the prediction displayed below:
#   [[0, 4, 4, 4, 2],
#    [0, 1, 4, 4, 2]]

# Most probable class per token: argmax over the class dimension.
o_softmax.argmax(dim=1)

tensor([[0, 4, 4, 4, 2],
        [0, 1, 4, 4, 2]])