In [1]:
# utils 
import torch

# data
from torchtext.datasets import PennTreebank, UDPOS
from torchtext.data import Field, BucketIterator

# model
import torch.nn as nn
import torch.nn.functional as F

# training
import torch.optim as optim
import tqdm

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

### Data 

In [4]:
TEXT = Field(tokenize="spacy", lower=True, batch_first=True)
UD_TAGS = Field(unk_token=None, batch_first=True)
PTB_TAGS = Field(unk_token=None, batch_first=True)

In [5]:
# fields list
fields = [("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS)]

In [6]:
# pen dataset
train, val, test = UDPOS.splits(fields)

downloading en-ud-v2.zip


en-ud-v2.zip: 100%|██████████| 688k/688k [00:01<00:00, 641kB/s]


extracting


In [7]:
#### Build vocab and create data loader
TEXT.build_vocab(
    train, 
    min_freq=2,
    vectors = "glove.6B.100d",
    unk_init = torch.Tensor.normal_,
)
UD_TAGS.build_vocab(train, min_freq=1)
PTB_TAGS.build_vocab(train, min_freq=1)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                          
100%|█████████▉| 399483/400000 [00:16<00:00, 25408.72it/s]

In [8]:
BATCH_SIZE = 128

In [9]:
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_size=BATCH_SIZE,
    device=device
)

In [10]:
for x in train_loader:
    print(x.text.shape)
    break

torch.Size([128, 74])


### Model

![title](./assets/pos-bidirectional-lstm.png)

In [11]:
### A multi-layer bidirection LSTM network
class POSTagger(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=4, dropout=0.2):
        
        super(POSTagger, self).__init__()
        self.vocab_size = vocab_size
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim, 
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout = dropout,
            bidirectional=True
        )
        self.fc = nn.Linear(in_features=2*hidden_size, out_features=output_size)
        # 2* hidden_size -> bidirectional
    
    def forward(self, x):
        # x.shape -> [batch, seq_len]
        
        embedded =  self.embedding(x)
        # embedded.shape -> [batch, seq_len, embedding_dim]
        
        outputs, _ = self.lstm(embedded)
        
        predictions = F.softmax(self.fc(outputs), dim=1)
        
        return predictions
        
        

### Training

In [12]:
# accuracy calculator
def accuracy(y, y_):
    correct = 0
    for i in range(y.shape[0]):
        if y[i]==y_[i]:
            correct+=1
    return correct/y.shape[0]

In [13]:
# evaluation function
def eval(model, data, criterion):
    acc = []
    loss = []
    
    with torch.no_grad():
        for batch in  data:
            x, y = batch.text, batch.udtags
            outputs = model(x)
           
            # y.shape -> [batch, seq_len, vocab_size]
            bs, seq_len, vocab_size = outputs.size(0), outputs.size(1), outputs.size(2)
            
            # flatten the ground truth as well as predictions
            l = criterion(outputs.view(bs*seq_len, -1), y.view(-1))
            
            y_ =  torch.argmax(outputs, dim=2)           
        
            a = accuracy(y.view(-1), y_.view(-1))
            
            loss.append(l.item())
            acc.append(a)
    
    return sum(acc)/len(acc), sum(loss)/len(loss)

In [18]:
# create model
model = POSTagger(
    vocab_size=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=128,
    output_size=len(UD_TAGS.vocab)
).to(device)

In [19]:
# training configuration
EPOCHS = 10
lr = 0.01
PAD_TOKEN = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]

In [20]:
# create criterion and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=lr)

In [21]:
epoch_progress = tqdm.tqdm(total=EPOCHS, desc="Epoch", position=0)
steps = 0

for epoch in range(EPOCHS):
    epoch_loss = []
    epochs_acc = []
    for batch in train_loader:
        x, y = batch.text, batch.udtags
        outputs = model(x)
        
        bs, seq_len, vocab_size = outputs.size(0), outputs.size(1), outputs.size(2)
        
        # calculate loss and gradient and then backpropagate the gradient
        optimizer.zero_grad()
        loss = criterion(outputs.view(bs*seq_len, -1), y.view(-1))
        loss.backward()
        optimizer.step()
        
        y_ =  torch.argmax(outputs, dim=2)           
        acc = accuracy(y.view(-1), y_.view(-1))
        
        if steps % 50 == 0:
            print(f'Epochs {epoch} | Steps {steps} | Train_loss {loss.item():.4f} | Train_acc {acc:.4f}')
        epoch_loss.append(loss.item())
        epochs_acc.append(acc)
        steps +=1 
    
    val_acc, val_loss = eval(model, val_loader, criterion)
    avg_loss = sum(epoch_loss)/len(epoch_loss)
    avg_acc = sum(epochs_acc)/len(epochs_acc)
    
    print(f'Epoch {epoch} | Train_loss {avg_loss:.4f} | Train_acc {avg_acc:.4f} | Val_loss {val_loss:.4f} | Val_acc {val_acc:.4f}')
    
    epoch_progress.update(1)

        

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Epochs 0 | Steps 0 | Train_loss 2.8904 | Train_acc 0.0925
Epochs 0 | Steps 50 | Train_loss 2.6407 | Train_acc 0.1448


Epoch:  10%|█         | 1/10 [00:37<05:33, 37.01s/it]

Epoch 0 | Train_loss 2.6595 | Train_acc 0.1874 | Val_loss 2.4736 | Val_acc 0.5393
Epochs 1 | Steps 100 | Train_loss 2.5382 | Train_acc 0.3462
Epochs 1 | Steps 150 | Train_loss 2.5192 | Train_acc 0.1877


Epoch:  20%|██        | 2/10 [01:13<04:55, 36.88s/it]

Epoch 1 | Train_loss 2.5371 | Train_acc 0.2873 | Val_loss 2.4432 | Val_acc 0.5797
Epochs 2 | Steps 200 | Train_loss 2.5130 | Train_acc 0.3164
Epochs 2 | Steps 250 | Train_loss 2.5197 | Train_acc 0.2958


Epoch:  30%|███       | 3/10 [01:49<04:17, 36.74s/it]

Epoch 2 | Train_loss 2.5128 | Train_acc 0.2724 | Val_loss 2.4329 | Val_acc 0.5934
Epochs 3 | Steps 300 | Train_loss 2.4970 | Train_acc 0.2527
Epochs 3 | Steps 350 | Train_loss 2.5053 | Train_acc 0.3080


Epoch:  40%|████      | 4/10 [02:27<03:41, 36.96s/it]

Epoch 3 | Train_loss 2.5009 | Train_acc 0.2616 | Val_loss 2.4251 | Val_acc 0.6019
Epochs 4 | Steps 400 | Train_loss 2.5166 | Train_acc 0.2130
Epochs 4 | Steps 450 | Train_loss 2.4918 | Train_acc 0.2005


Epoch:  50%|█████     | 5/10 [03:04<03:04, 36.86s/it]

Epoch 4 | Train_loss 2.4958 | Train_acc 0.2286 | Val_loss 2.4218 | Val_acc 0.6058
Epochs 5 | Steps 500 | Train_loss 2.4934 | Train_acc 0.2907
Epochs 5 | Steps 550 | Train_loss 2.4846 | Train_acc 0.2299


Epoch:  60%|██████    | 6/10 [03:41<02:28, 37.07s/it]

Epoch 5 | Train_loss 2.4926 | Train_acc 0.2363 | Val_loss 2.4166 | Val_acc 0.6121
Epochs 6 | Steps 600 | Train_loss 2.4976 | Train_acc 0.2460
Epochs 6 | Steps 650 | Train_loss 2.4785 | Train_acc 0.2271


Epoch:  70%|███████   | 7/10 [04:17<01:50, 36.81s/it]

Epoch 6 | Train_loss 2.4905 | Train_acc 0.2451 | Val_loss 2.4208 | Val_acc 0.6067
Epochs 7 | Steps 700 | Train_loss 2.4850 | Train_acc 0.2095
Epochs 7 | Steps 750 | Train_loss 2.4998 | Train_acc 0.2372


Epoch:  80%|████████  | 8/10 [04:54<01:13, 36.71s/it]

Epoch 7 | Train_loss 2.4891 | Train_acc 0.2173 | Val_loss 2.4126 | Val_acc 0.6219
Epochs 8 | Steps 800 | Train_loss 2.4528 | Train_acc 0.2699
Epochs 8 | Steps 850 | Train_loss 2.5064 | Train_acc 0.1580


Epoch:  90%|█████████ | 9/10 [05:30<00:36, 36.63s/it]

Epoch 8 | Train_loss 2.4874 | Train_acc 0.2251 | Val_loss 2.4091 | Val_acc 0.6305
Epochs 9 | Steps 900 | Train_loss 2.4874 | Train_acc 0.2021
Epochs 9 | Steps 950 | Train_loss 2.4776 | Train_acc 0.2496


Epoch: 100%|██████████| 10/10 [06:07<00:00, 36.64s/it]

Epoch 9 | Train_loss 2.4867 | Train_acc 0.2173 | Val_loss 2.4123 | Val_acc 0.6268


In [22]:
eval(model, test_loader, criterion)

(0.6181387111742966, 2.467750409070183)