In [1]:
import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

WORD = data.Field(init_token="<bos>", eos_token="<eos>", include_lengths=True)
UD_TAG = data.Field(init_token="<bos>", eos_token="<eos>", pad_token=None, unk_token=None)

CHAR_NESTING = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
CHAR = data.NestedField(CHAR_NESTING, init_token="<bos>", eos_token="<eos>")

In [2]:
import torch.nn as nn
import torch.nn.functional as F

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # self.hidden = self.init_hidden()

    # def init_hidden(self, batch_size):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        # return (torch.zeros(1, batch_size, self.hidden_dim),
        #         torch.zeros(1, batch_size, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        # print(embeds.shape)
        lstm_out, self.hidden = self.lstm(embeds)
        # print(lstm_out.shape)
        tag_space = self.hidden2tag(lstm_out)
        # print(tag_space.shape)
        return tag_space

In [3]:
from torchtext import datasets

train_data, valid_data, test_data = datasets.UDPOS.splits(
    fields=(('word', WORD), ('udtag', UD_TAG)))

downloading en-ud-v2.zip


en-ud-v2.zip:   0%|          | 0.00/688k [00:00<?, ?B/s]

en-ud-v2.zip:   5%|▍         | 32.8k/688k [00:00<00:02, 284kB/s]

en-ud-v2.zip:   7%|▋         | 49.2k/688k [00:00<00:02, 225kB/s]

en-ud-v2.zip:  14%|█▍        | 98.3k/688k [00:00<00:02, 261kB/s]

en-ud-v2.zip:  29%|██▊       | 197k/688k [00:00<00:01, 329kB/s] 

en-ud-v2.zip:  55%|█████▍    | 377k/688k [00:00<00:00, 425kB/s]

en-ud-v2.zip:  81%|████████  | 557k/688k [00:00<00:00, 548kB/s]

en-ud-v2.zip: 100%|██████████| 688k/688k [00:00<00:00, 959kB/s]




extracting


In [4]:
print(train_data.fields)
print(len(train_data))
print(vars(train_data[0]))

{'word': <torchtext.data.field.Field object at 0x7f4ac02fda90>, 'udtag': <torchtext.data.field.Field object at 0x7f4ac0257160>}
12543
{'word': ['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], 'udtag': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']}


In [277]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(val_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 12543
Number of validation examples: 2002
Number of testing examples: 2077


In [190]:
print(vars(train_data.examples[0]))

{'word': ['Al', '-', 'Zaman', ':', 'American', 'forces', 'killed', 'Shaikh', 'Abdullah', 'al', '-', 'Ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'Qaim', ',', 'near', 'the', 'Syrian', 'border', '.'], 'udtag': ['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']}


In [313]:
WORD.build_vocab(train_data)
UD_TAG.build_vocab(train_data)

In [299]:
print(f"Unique tokens in WORD vocabulary: {len(WORD.vocab)}")
print(f"Unique tokens in CHAR vocabulary: {len(CHAR.vocab)}")
print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAG.vocab)}")

Unique tokens in WORD vocabulary: 19676
Unique tokens in CHAR vocabulary: 112
Unique tokens in UD_TAG vocabulary: 19


In [300]:

print(CHAR.vocab.__dict__)

{'freqs': Counter(), 'itos': ['<unk>', '<pad>', '<bos>', '<eos>', 'e', 't', 'a', 'o', 'n', 'i', 's', 'r', 'h', 'l', 'd', 'u', 'c', 'm', 'y', 'f', 'g', 'w', 'p', '.', 'b', 'v', ',', 'k', 'I', '-', 'T', 'A', '0', 'S', "'", 'C', '1', 'E', 'M', 'P', '2', 'x', 'N', 'B', 'W', 'H', 'O', '"', 'D', 'R', '!', 'L', '/', ':', '3', 'j', 'F', ')', '?', 'G', 'q', '(', '5', 'U', '4', '9', 'J', 'Y', 'z', '6', '7', '8', '_', 'K', 'V', '=', '*', '$', '@', '&', '>', 'Q', '<', ';', 'Z', '’', 'X', '#', '+', '%', '[', ']', '“', '”', '|', '~', '`', '‘', '–', '—', '^', '…', '·', '{', '}', 'é', '£', '\xad', '³', 'Ã', 'á', 'ç'], 'stoi': defaultdict(<function _default_unk_index at 0x1143a20d0>, {'<unk>': 0, '<pad>': 1, '<bos>': 2, '<eos>': 3, 'e': 4, 't': 5, 'a': 6, 'o': 7, 'n': 8, 'i': 9, 's': 10, 'r': 11, 'h': 12, 'l': 13, 'd': 14, 'u': 15, 'c': 16, 'm': 17, 'y': 18, 'f': 19, 'g': 20, 'w': 21, 'p': 22, '.': 23, 'b': 24, 'v': 25, ',': 26, 'k': 27, 'I': 28, '-': 29, 'T': 30, 'A': 31, '0': 32, 'S': 33, "'": 34, 'C

In [194]:
print(WORD.vocab.itos[:10])

['<unk>', '<pad>', '<bos>', '<eos>', '.', 'the', ',', 'to', 'and', 'a']


In [195]:

print(UD_TAG.vocab.stoi)

defaultdict(<function _default_unk_index at 0x1143a20d0>, {'<bos>': 0, '<eos>': 1, 'NOUN': 2, 'PUNCT': 3, 'VERB': 4, 'PRON': 5, 'ADP': 6, 'DET': 7, 'PROPN': 8, 'ADJ': 9, 'AUX': 10, 'ADV': 11, 'CCONJ': 12, 'PART': 13, 'NUM': 14, 'SCONJ': 15, 'X': 16, 'INTJ': 17, 'SYM': 18})


In [315]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    repeat=False,
    device=-1)

In [316]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 300

In [343]:
import torch.optim as optim

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(WORD.vocab), len(UD_TAG.vocab))
loss_function = nn.CrossEntropyLoss(ignore_index=WORD.vocab.stoi['<pad>'])
optimizer = optim.SGD(model.parameters(), lr=0.1)

model = model.to(device)
loss_function = loss_function.to(device)

In [264]:
def sequence_accuracy(scores, targets, lengths):
    _, predict = torch.max(scores,1)
    batch_len = lengths.max()
    mask = torch.Tensor([[0] + [1 for i in range(l-2)] + [0 for i in range(batch_len - l + 1)] for l in lengths])
    mask = mask.permute(1, 0).reshape(-1)
    total = sum(mask)
    correct = predict == targets
    correct = correct.type(torch.FloatTensor)
    masked_correct = mask * correct
    acc = masked_correct.sum() / total
    # print(predict)
    # print(targets)
    # print(total, correct)
    return acc

In [344]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()
  
    for i, batch in enumerate(iterator):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        optimizer.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        # model.hidden = model.init_hidden()

        # Step 2. Run our forward pass.
        inputs, lengths = batch.word
        predictions = model(inputs)
        predictions = predictions.reshape(-1, predictions.size()[-1])
        labels = batch.udtag.reshape(-1)

        # Step 3. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = criterion(predictions, labels)
        acc = sequence_accuracy(predictions, labels, lengths)
        
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
        # if i % 10 == 0:
        #     print(f'| Batch: {i:02} | Batch Loss: {loss:.3f} | Batch Acc: {acc*100:.2f}%')
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [345]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            
            # model.hidden = model.init_hidden()
            inputs, lengths = batch.word
            predictions = model(inputs)
            predictions = predictions.reshape(-1, predictions.size()[-1])
            labels = batch.udtag.reshape(-1)
            # print(predictions.size(), labels.size())
            
            loss = criterion(predictions, labels)
            acc = sequence_accuracy(predictions, labels, lengths)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [346]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, loss_function)
    valid_loss, valid_acc = evaluate(model, valid_iterator, loss_function)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


| Epoch: 01 | Train Loss: 2.071 | Train Acc: 34.63% | Val. Loss: 1.540 | Val. Acc: 45.63% |
| Epoch: 02 | Train Loss: 1.443 | Train Acc: 53.80% | Val. Loss: 1.220 | Val. Acc: 56.02% |
| Epoch: 03 | Train Loss: 1.190 | Train Acc: 60.55% | Val. Loss: 1.060 | Val. Acc: 59.99% |
| Epoch: 04 | Train Loss: 1.048 | Train Acc: 64.09% | Val. Loss: 0.958 | Val. Acc: 63.22% |
| Epoch: 05 | Train Loss: 0.943 | Train Acc: 67.03% | Val. Loss: 0.885 | Val. Acc: 65.59% |
