## Data

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer

# Pretrained tokenizer; its length is used as the embedding-table size below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = len(tokenizer)

# Toy corpus: one whitespace-separated sentence per entry.
corpus = ["i like a dog", "books are expensive"]
data_size = len(corpus)

# One tag id per word: 0 = noun/pronoun, 1 = verb, 2 = other.
labels = [
    [0, 1, 2, 0],
    [0, 1, 2],
]

# Every sample is padded/truncated to this many positions.
sequence_length = 4



In [2]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """Token-classification dataset yielding fixed-length (token_ids, label_ids) pairs.

    Each sentence is split on whitespace, every word is mapped to an id with
    ``tokenizer.convert_tokens_to_ids``, and both the ids and the per-word
    labels are padded or truncated to ``max_len``.

    Args:
        sentences: Sequence of whitespace-separated sentences.
        labels: Sequence of per-word label-id lists, parallel to ``sentences``.
        tokenizer: Object exposing ``convert_tokens_to_ids`` and ``pad_token_id``.
        max_len: Length every returned sequence is padded/truncated to.
        pad_label_id: Label id written at padded positions. Defaults to 3,
            the value that was previously hard-coded.
    """

    def __init__(self, sentences, labels, tokenizer, max_len, pad_label_id=3):
        super().__init__()
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pad_label_id = pad_label_id

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(token_ids, label_ids)`` tensors of length ``max_len`` for sample ``idx``."""
        words = self.sentences[idx].split()
        token_ids = self.tokenizer.convert_tokens_to_ids(words)
        return self.pad_and_truncate(token_ids, self.labels[idx])

    def pad_and_truncate(self, tokens, labels):
        """Force ``tokens`` and ``labels`` to exactly ``max_len`` entries each.

        Tokens are padded with ``tokenizer.pad_token_id`` and labels with
        ``pad_label_id``. Each sequence is sized from its OWN length, so a
        label list that is shorter than its token list still comes out with
        ``max_len`` entries (previously the label padding was computed from
        the token count, and the truncation branch never padded labels).

        Returns:
            Tuple ``(token_tensor, label_tensor)`` of int64 tensors.
        """
        padded_tokens = self._fit(tokens, self.tokenizer.pad_token_id)
        padded_labels = self._fit(labels, self.pad_label_id)
        return (
            torch.tensor(padded_tokens, dtype=torch.long),
            torch.tensor(padded_labels, dtype=torch.long),
        )

    def _fit(self, values, fill):
        """Truncate ``values`` to ``max_len`` and right-pad with ``fill`` if shorter."""
        clipped = list(values)[: self.max_len]
        return clipped + [fill] * (self.max_len - len(clipped))

In [3]:
from torch.utils.data import DataLoader

# Wrap the toy corpus in a dataset that pads every sample to `sequence_length`.
train_set = MyDataset(
    sentences=corpus,
    labels=labels,
    tokenizer=tokenizer,
    max_len=sequence_length,
)

# Batch size 2 equals the corpus size, so one batch covers all the data.
batch_size = 2
train_loader = DataLoader(train_set, batch_size=batch_size)

In [4]:
# Show every (token ids, label ids) batch, each followed by a blank line.
# `x` and `y` stay bound to the last batch after the loop finishes.
for x, y in train_loader:
    print(x, y, sep="\n", end="\n\n")

tensor([[1045, 2066, 1037, 3899],
        [2808, 2024, 6450,    0]])
tensor([[0, 1, 2, 0],
        [0, 1, 2, 3]])



## Model

In [5]:
class POS_Model(nn.Module):
    """Per-token tagger: an embedding lookup followed by a linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        num_classes: Number of output classes predicted for each token.
        embedding_dim: Width of the token embeddings. Defaults to 5, the
            value that was previously hard-coded.
    """

    def __init__(self, vocab_size, num_classes, embedding_dim=5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        """Return class logits laid out as (batch, num_classes, seq_len).

        ``x`` is a batch of token-id sequences, (batch, seq_len). The linear
        layer yields (batch, seq_len, num_classes); the last two axes are
        swapped so the class axis sits at dim 1, which is the layout
        ``nn.CrossEntropyLoss`` takes for per-position targets.
        """
        embedded = self.embedding(x)
        logits = self.fc(embedded)
        return logits.permute(0, 2, 1)

# Four output classes: tag ids 0-2 plus the padding label 3.
model = POS_Model(vocab_size, num_classes=4)

# Smoke test on the batch `x` left bound by the loader loop above.
output = model(x)
print(output.shape)

torch.Size([2, 4, 4])


## Train with full data

In [6]:
# Adam over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# Label id 3 marks padded positions; those targets are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=3)

In [7]:
# Train for a fixed number of epochs, drawing batches from `train_loader`
# instead of reusing whatever `x, y` an earlier cell happened to leave bound.
# With batch_size equal to the corpus size, every epoch is still a single
# full-data step, so one loss value is printed per epoch.
num_epochs = 30
for _ in range(num_epochs):
    for x, y in train_loader:
        optimizer.zero_grad()
        outputs = model(x)  # logits with the class axis at dim 1
        loss = criterion(outputs, y)  # padded targets are ignored by the criterion
        print(loss.item())
        loss.backward()
        optimizer.step()

1.5027673244476318
1.4699853658676147
1.438083529472351
1.40706467628479
1.376928687095642
1.3476636409759521
1.3192418813705444
1.2916191816329956
1.264732003211975
1.2385015487670898
1.2128398418426514
1.1876558065414429
1.1628614664077759
1.1383739709854126
1.1141186952590942
1.0900306701660156
1.0660560131072998
1.0421522855758667
1.0182883739471436
0.9944431185722351
0.970605194568634
0.9467710852622986
0.922943651676178
0.899131715297699
0.8753486275672913
0.8516116142272949
0.8279396295547485
0.8043543696403503
0.7808781862258911
0.7575339078903198


In [8]:
# Forward pass, then normalise the class axis (dim 1) into probabilities.
outputs = model(x)
o_softmax = torch.softmax(outputs, dim=1)

# Print the class distribution at each of the 4 positions for both sentences,
# with a blank line separating the two sentences.
for sentence_idx in (0, 1):
    if sentence_idx > 0:
        print()
    for position in range(4):
        print(o_softmax[sentence_idx, :, position])

tensor([0.6061, 0.0861, 0.1472, 0.1606], grad_fn=<SelectBackward0>)
tensor([0.0914, 0.4173, 0.3814, 0.1100], grad_fn=<SelectBackward0>)
tensor([0.2779, 0.2460, 0.3520, 0.1240], grad_fn=<SelectBackward0>)
tensor([0.4769, 0.2364, 0.1476, 0.1390], grad_fn=<SelectBackward0>)

tensor([0.3557, 0.1594, 0.2711, 0.2138], grad_fn=<SelectBackward0>)
tensor([0.0786, 0.7208, 0.1249, 0.0757], grad_fn=<SelectBackward0>)
tensor([0.0522, 0.2527, 0.5380, 0.1571], grad_fn=<SelectBackward0>)
tensor([0.6408, 0.1012, 0.1786, 0.0793], grad_fn=<SelectBackward0>)


In [9]:
# Target tags for comparison (* = padded position, ignored by the loss):
# [[0, 1, 2, 0],
#  [0, 1, 2, *]]

# Most probable class at every position.
torch.argmax(o_softmax, dim=1)

tensor([[0, 1, 2, 0],
        [0, 1, 2, 0]])