In [1]:
import torch
from torchtext import data

import torch.nn as nn
import torch.nn.functional as F

from torch.nn.utils.rnn import pack_padded_sequence

SEED = 1234

torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True


<torch._C.Generator at 0x7f5bc4096ab0>

In [4]:
class SequenceTaggingDataset(data.Dataset):
    """Defines a dataset for sequence tagging. Examples in this dataset
    contain paired lists -- paired list of words and tags.
    For example, in the case of part-of-speech tagging, an example is of the
    form
    [I, love, PyTorch, .] paired with [PRON, VERB, PROPN, PUNCT]
    See torchtext/test/sequence_tagging.py on how to use this class.
    """

    @staticmethod
    def sort_key(example):
        for attr in dir(example):
            if not callable(getattr(example, attr)) and \
                    not attr.startswith("__"):
                return len(getattr(example, attr))
        return 0

    def __init__(self, path, fields, separator="\t", **kwargs):
        examples = []
        columns = []
        print('path:', path)
        with open(path) as input_file:
            for line in input_file:
                line = line.strip()
                if line.startswith('#'):
                    continue
                elif line == "":
                    if columns:
                        examples.append(data.Example.fromlist(columns, fields))
                    columns = []
                else:
                    for i, column in enumerate(line.split(separator)):
                        if len(columns) < i + 1:
                            columns.append([])
                        columns[i].append(column)

            if columns:
                examples.append(data.Example.fromlist(columns, fields))
        super(SequenceTaggingDataset, self).__init__(examples, fields,
                                                     **kwargs)

In [5]:

WORD = data.Field(init_token="<bos>", eos_token="<eos>", include_lengths=True, lower=True, batch_first=True)
UD_TAG = data.Field(init_token="<bos>", eos_token="<eos>", pad_token=None, unk_token=None, batch_first=True)

CHAR_NESTING = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
CHAR = data.NestedField(CHAR_NESTING, init_token="<bos>", eos_token="<eos>", include_lengths=True)

In [6]:
class UDPOSMorph(SequenceTaggingDataset):

    # Universal Dependencies English Web Treebank.
    # Download original at http://universaldependencies.org/
    # License: http://creativecommons.org/licenses/by-sa/4.0/
    # urls = ['https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip']
    # dirname = 'en-ud-v2'

    @classmethod
    def splits(cls, fields, root='data', train=None, validation=None,
               test=None, **kwargs):
        cls.name = ""
        cls.dirname = ""

        return super(UDPOSMorph, cls).splits(
            fields=fields, root=root, train=train, validation=validation,
            test=test, **kwargs)

In [7]:
train_data, valid_data, test_data = UDPOSMorph.splits(
    fields=((None, None), (('word', 'char'), (WORD, CHAR)), (None, None), ('udtag', UD_TAG)),
    root='data', train='et-ud-train.conllu', validation='et-ud-dev.conllu', test='et-ud-test.conllu')

path: data/et-ud-train.conllu
path: data/et-ud-dev.conllu
path: data/et-ud-test.conllu


In [8]:
train_data, valid_data, test_data = UDPOSMorph.splits(
    fields=((None, None), ('char', CHAR), (None, None), ('udtag', UD_TAG)),
    root='data', train='et-ud-train.conllu', validation='et-ud-dev.conllu', test='et-ud-test.conllu')

path: data/et-ud-train.conllu
path: data/et-ud-dev.conllu
path: data/et-ud-test.conllu


In [9]:
print(train_data[0].__dict__)

{'char': [['Ö', 'ö'], ['o', 'l', 'i'], ['t', 'ä', 'i', 'e', 's', 't', 'i'], ['t', 'u', 'u', 'l', 'e', 't', 'u'], ['.']], 'udtag': ['NOUN', 'VERB', 'ADV', 'ADJ', 'PUNCT']}


In [20]:
from torchtext import datasets

train_data, valid_data, test_data = datasets.UDPOS.splits(
    fields=((('word', 'char'), (WORD, CHAR)), ('udtag', UD_TAG)))

In [10]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 14510
Number of validation examples: 1793
Number of testing examples: 1806


In [22]:
from torchtext import vocab
import os

glove_embeds = vocab.GloVe(name="6B", dim=300, cache="/Users/kairit/Projects/textcnn/notebooks/.vector_cache")


In [8]:
from torchtext.vocab import FastText
fasttext_embeds = FastText(language='et')

  0%|          | 0/329988 [00:00<?, ?it/s]Skipping token 329987 with 1-dimensional vector ['300']; likely a header
100%|██████████| 329988/329988 [02:15<00:00, 2437.15it/s]


In [11]:
# WORD.build_vocab(train_data)
# WORD.vocab.extend(fasttext_embeds)
# WORD.vocab.load_vectors(fasttext_embeds)

CHAR.build_vocab(train_data)
UD_TAG.build_vocab(train_data)

In [14]:
WORD.vocab.vectors.size()

torch.Size([343685, 300])

In [13]:
# print(f"Unique tokens in WORD vocabulary: {len(WORD.vocab)}")
print(f"Unique tokens in CHAR vocabulary: {len(CHAR.vocab)}")
print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAG.vocab)}")

Unique tokens in CHAR vocabulary: 120
Unique tokens in UD_TAG vocabulary: 17


In [14]:
class CharEmbeddings(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super(CharEmbeddings, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.char_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embed_dropout = nn.Dropout(p=0.5)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        
    def forward(self, chars, lengths):
        chars_size = chars.size()
        
        # Aggregate sequence length and batch dimensions
        chars = chars.reshape(-1, chars_size[-1])
        
        # Embed characters
        embeds = self.char_embeddings(chars)
        embeds = self.embed_dropout(embeds)
        # Move batch to second dimension
        
        # Concat the batch and sentence dimensions of word lengths
        lengths = lengths.reshape(-1)
        
        # Sort lengths in descending order
        lengths_sort, idx_sort = torch.sort(lengths, dim=0, descending=True)
        # Replace 0 entries with 1s
        lengths_sort[lengths_sort==0] = 1
        _, idx_unsort = torch.sort(idx_sort, dim=0)
    
        # Sort embeddings
        embeds_sort = embeds.index_select(0, idx_sort)
        # Pack sorted embeddings
        embeds_pack = pack_padded_sequence(embeds_sort, lengths_sort, batch_first=True)
        
        # Send the pack through LSTM
        _, hidden = self.lstm(embeds_pack)
        
        # Concatenate states to get word embeddings
        word_embeds = torch.cat(hidden, dim=2)
        # Move directionality dimension to second position
        word_embeds = word_embeds.permute(1, 0, 2)
        # Reshape to (batch x sequence) x dimension
        word_embeds = word_embeds.reshape(-1, self.hidden_dim * 4)
        
        # Restore the original index ordering
        word_embeds = word_embeds.index_select(0, idx_unsort)
        
        # Reshape back to original shape
        word_embeds = word_embeds.reshape(chars_size[0], chars_size[1], -1)
        # Permute axes to sequence x batch x dimension
        # word_embeds = word_embeds.permute(1, 0, 2)
        return word_embeds

In [13]:


class LSTMTagger(nn.Module):

    def __init__(self, word_embedding_dim, embedding_dim, hidden_dim, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.special_embeddings = nn.Embedding(5, word_embedding_dim, padding_idx=-1)
        # self.word_embeddings.weight.requires_grad=False
        
        self.embed2input = nn.Linear(word_embedding_dim, word_embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(2*hidden_dim, tagset_size)

    def forward(self, word_embeds, words, lengths, char_embeds=None):
        # embeds = self.word_embeddings(sentence)
        
        mask = ~((words == WORD.vocab.stoi['<bos>']) | (words == WORD.vocab.stoi['<eos>']) 
                | (words == WORD.vocab.stoi['<unk>']) | (words == WORD.vocab.stoi['<pad>']))
        words[mask] = 4
        
        special_embeds = self.special_embeddings(words)
        # print(special_embeds)
        
        word_embeds = word_embeds + special_embeds
        
        embeds = self.embed2input(word_embeds)
        if char_embeds is not None:
            embeds = torch.cat([embeds, char_embeds], dim=2)
        
        lengths = lengths.reshape(-1)
        embeds_pack = pack_padded_sequence(embeds, lengths, batch_first=True)
        lstm_pack_out, _ = model.lstm(embeds_pack)

        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_pack_out, batch_first=True)
        tag_space = self.hidden2tag(lstm_out)
        return tag_space

In [15]:
BATCH_SIZE = 64
WORD_EMBEDDING_DIM = 300
HIDDEN_DIM = 300
CHAR_EMBEDDING_DIM = 100
CHAR_HIDDEN_DIM = 100
EMBEDDING_DIM = WORD_EMBEDDING_DIM + 4 * CHAR_EMBEDDING_DIM

In [16]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    repeat=False,
    device=-1)

In [17]:
import torch.optim as optim

char_model = CharEmbeddings(CHAR_EMBEDDING_DIM, CHAR_HIDDEN_DIM, len(CHAR.vocab))

model = LSTMTagger(WORD_EMBEDDING_DIM, EMBEDDING_DIM, HIDDEN_DIM, len(UD_TAG.vocab))
# pretrained_embeddings = WORD.vocab.vectors
# model.word_embeddings.weight.data.copy_(pretrained_embeddings)

loss_function = nn.CrossEntropyLoss(ignore_index=WORD.vocab.stoi['<pad>'])

params = list(char_model.parameters()) + list(model.parameters())
# optimizer = optim.SGD(params, lr=0.1)
optimizer = optim.Adam(params)

char_model = char_model.to(device)
model = model.to(device)
loss_function = loss_function.to(device)

In [18]:
def train(model, char_model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    char_model.train()
    model.train()

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()

        words, lengths = batch.word
        chars, _, char_lengths = batch.char
        char_embeddings = char_model(chars, char_lengths)
        # print('char embeddings:', char_embeddings.size())
        
        word_embeddings = F.embedding(words, WORD.vocab.vectors)
        predictions = model(word_embeddings, words, lengths, char_embeddings)
        predictions = predictions.reshape(-1, predictions.size()[-1])
        labels = batch.udtag.reshape(-1)
        words = words.reshape(-1)

        # Step 3. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = criterion(predictions, labels)
        acc = sequence_accuracy(words, predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

        if i % 10 == 0:
            print(f'| Batch: {i:02} | Batch Loss: {loss:.3f} | Batch Acc: {acc*100:.2f}%')
            # print(model.bos_embed)

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [19]:
def evaluate(model, char_model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_oov_acc = 0
    oov_batches = 0
    
    model.eval()
    char_model.eval()
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            
            words, lengths = batch.word
            chars, _, char_lengths = batch.char
            char_embeddings = char_model(chars, char_lengths)
            
            word_embeddings = F.embedding(words, WORD.vocab.vectors)
            predictions = model(word_embeddings, words, lengths, char_embeddings)
            predictions = predictions.reshape(-1, predictions.size()[-1])
            labels = batch.udtag.reshape(-1)
            words = words.reshape(-1)
            
            loss = criterion(predictions, labels)
            acc = sequence_accuracy(words, predictions, labels)
    
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
            oov_acc = oov_accuracy(words, predictions, labels)
            if oov_acc is not None:
                epoch_oov_acc += oov_acc
                oov_batches += 1
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_oov_acc / oov_batches

In [20]:
import numpy as np

def oov_accuracy(words, scores, targets):
    _, predictions = torch.max(scores, 1)

    data = [[pred, lab] for (word, pred, lab) in zip(words, predictions, targets)]

    if len(data) == 0:
        return None
    else:
        data = np.array(data)
        return np.mean(data[:,0] == data[:,1])

            
            

In [23]:
N_EPOCHS = 1

for epoch in range(N_EPOCHS):

    train_loss, train_acc  = train(model, char_model, train_iterator, optimizer, loss_function)
    valid_loss, valid_acc, oov_acc = evaluate(model, char_model, valid_iterator, loss_function)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% | Val. OOV Acc: {oov_acc*100:.2f}% |')

| Batch: 00 | Batch Loss: 2.831 | Batch Acc: 7.08%
| Batch: 10 | Batch Loss: 1.584 | Batch Acc: 40.18%
| Batch: 20 | Batch Loss: 1.143 | Batch Acc: 56.36%
| Batch: 30 | Batch Loss: 1.185 | Batch Acc: 60.40%
| Batch: 40 | Batch Loss: 0.936 | Batch Acc: 71.65%
| Batch: 50 | Batch Loss: 0.683 | Batch Acc: 76.22%
| Batch: 60 | Batch Loss: 0.550 | Batch Acc: 80.56%
| Batch: 70 | Batch Loss: 0.295 | Batch Acc: 89.58%
| Batch: 80 | Batch Loss: 0.428 | Batch Acc: 86.05%
| Batch: 90 | Batch Loss: 0.395 | Batch Acc: 85.34%
| Batch: 100 | Batch Loss: 0.444 | Batch Acc: 86.19%
| Batch: 110 | Batch Loss: 0.277 | Batch Acc: 90.23%
| Batch: 120 | Batch Loss: 0.369 | Batch Acc: 89.09%
| Batch: 130 | Batch Loss: 0.276 | Batch Acc: 91.29%
| Batch: 140 | Batch Loss: 0.366 | Batch Acc: 89.36%
| Batch: 150 | Batch Loss: 0.225 | Batch Acc: 91.52%
| Batch: 160 | Batch Loss: 0.280 | Batch Acc: 87.97%
| Batch: 170 | Batch Loss: 0.283 | Batch Acc: 89.79%
| Batch: 180 | Batch Loss: 0.245 | Batch Acc: 91.02%
| Ba

  return Variable(arr, volatile=not train)
  return Variable(arr, volatile=not train), lengths


| Epoch: 01 | Train Loss: 0.587 | Train Acc: 81.11% | Val. Loss: 0.525 | Val. Acc: 88.64% | Val. OOV Acc: 82.72% |


In [107]:
train_loss, train_acc, train_oov_acc = evaluate(model, char_model, train_iterator, loss_function)
print(train_loss, train_acc, train_oov_acc)
valid_loss, valid_acc, oov_acc = evaluate(model, char_model, valid_iterator, loss_function)
print(valid_loss, valid_acc, oov_acc)

0.13253304263462826 0.9542531751248301 0.872427053548


  return Variable(arr, volatile=not train)
  return Variable(arr, volatile=not train), lengths


0.1756999532226473 0.9302465580403805 0.845655787727


In [108]:
item = next(train_iterator.__iter__())
print(item)


[torchtext.data.batch.Batch of size 64 from UDPOS]
	[.udtag]:[torch.LongTensor of size 64x15]
	[.word]:('[torch.LongTensor of size 64x15]', '[torch.LongTensor of size 64]')
	[.char]:('[torch.LongTensor of size 64x15x17]', '[torch.LongTensor of size 64]', '[torch.LongTensor of size 64x15]')


In [164]:
words, lengths = item.word
words_size = words.size()
print('words:', words_size, words)

bos_embed = torch.tensor([1.1] * 300)
eos_embed = torch.tensor([2.2] * 300)
unk_embed = torch.tensor([3.3] * 300)

mask = ~((words == WORD.vocab.stoi['<bos>']) | (words == WORD.vocab.stoi['<eos>']) | (words == WORD.vocab.stoi['<unk>']))
print('mask:', mask.size(), mask)

eos_mask = words == WORD.vocab.stoi['<eos>']
unk_mask = words == WORD.vocab.stoi['<unk>']

# Embed words
embeds = F.embedding(words, WORD.vocab.vectors)

print('word embeds:', embeds.size())
print('bos embeds:', embeds[bos_mask].size(), embeds[bos_mask])

embeds[bos_mask] = bos_embed
print('bos embeds:', embeds[bos_mask].size(), embeds[bos_mask])

embeds[eos_mask] = eos_embed
print('eos embeds:', embeds[eos_mask].size(), embeds[eos_mask])

if unk_mask.sum() > 0:
    embeds[unk_mask] = unk_embed
    print('unk embeds:', embeds[unk_mask].size(), embeds[unk_mask])



words: torch.Size([64, 15]) tensor([[     2,     14,     57,    687,     15,     14,    254,    110,
              7,    686,      9,   8387,    784,      5,      3],
        [     2,     51,     26,    131,    988,   1918,   3702,      4,
           4058,   2026,     23,   1664,   1062,      5,      3],
        [     2,     11,   2848,     16,     67,     66,     16,     22,
           1403,      8,     22,    539,   9354,      5,      3],
        [     2,      4,    193,    136,      6,      4,    682,    930,
           1159,      4,     90,   1108,    439,      5,      3],
        [     2,  10226,     21,      4,    528,   1025,      6,  14137,
              8,      4,     81,   3095,  12274,      5,      3],
        [     2,    311,    395,     13,     45,    292,      6,     40,
             14,     26,    237,   2520,    876,      5,      3],
        [     2,    131,    501,   6860,     26,    452,      7,    391,
            302,   4071,     17,    494,   1611,      5,      3],

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence
word_embeddings = F.embedding(words, WORD.vocab.vectors)
print(word_embeddings.size())
packed_words = pack_padded_sequence(word_embeddings, lengths, batch_first=True)
print(packed_words)

In [137]:
vocab = WORD.vocab
print('<bos>:', vocab.stoi['<bos>'])
print('<eos>:', vocab.stoi['<eos>'])
print('<unk>:', vocab.stoi['<unk>'])
print('<pad>:', vocab.stoi['<pad>'])

<bos>: 2
<eos>: 3
<unk>: 0
<pad>: 1


In [None]:
WORD.vocab.extend(glove_embeds)

In [None]:
WORD.vocab.load_vectors(glove_embeds)

In [None]:
class ExtendedVocab(Vocab):
    

In [None]:
from torchtext.data import Field

class SpecialisedField(Field):
    
    vocab_cls = ExtendedVocab
    
    def __init__(**kwargs):
        super(SpecialisedField, self).__init__(**kwargs)
        