In [1]:
import pyconll #pip3 install this if you don't have it
import torchtext.data as tt
import torch 
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np

In [2]:
AFRIKAANS_TRAIN = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu'
AFRIKAANS_DEV = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu'
AFRIKAANS_TEST = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu'

DUTCH_TRAIN = "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
DUTCH_DEV = "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
DUTCH_TEST = "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"

In [3]:
du_embed = {}
with open("wiki.multi.nl.vec") as f:
    for line in f:
        tokens = line.split()
        word = tokens[0]
        vector = list(map(float, tokens[1:]))
        tensor_vector = torch.FloatTensor(vector)
        du_embed[word] = tensor_vector
embedding_dim = len(du_embed['.'])

In [4]:
# from https://github.com/soutsios/pos-tagger-bert/blob/master/pos_tagger_bert.ipynb
def make_sentences(path):
    data = pyconll.load_from_file(path)
    sentences = []
    tagged_sentences = []
    for each in data:
        tagged_sentence=[]
        sentence = []
        for token in each:
            if token.upos and token.form:
                tagged_sentence.append(token.upos)
                sentence.append(token.form.lower())
        tagged_sentences.append(tagged_sentence)
        sentences.append(sentence)
    return sentences, tagged_sentences

In [5]:
train_afr_raw, tagged_train_afr_raw = make_sentences(AFRIKAANS_TRAIN)
dev_afr_raw, tagged_dev_afr_raw = make_sentences(AFRIKAANS_DEV)
test_afr_raw, tagged_test_afr_raw = make_sentences(AFRIKAANS_TEST)

train_du_raw, tagged_train_du_raw = make_sentences(DUTCH_TRAIN)
dev_du_raw, tagged_dev_du_raw = make_sentences(DUTCH_DEV)
test_du_raw, tagged_test_du_raw = make_sentences(DUTCH_TEST)

In [6]:
print("AFRIKAANS")
print("Tagged sentences in train set: ", len(tagged_train_afr_raw))
print("Tagged words in train set:", len([item for sublist in tagged_train_afr_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_afr_raw))
print("Tagged words in dev set:", len([item for sublist in tagged_dev_afr_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_afr_raw))
print("Tagged words in test set:", len([item for sublist in tagged_test_afr_raw for item in sublist]))
print(40*'*')
print("Total sentences in dataset:", len(tagged_train_afr_raw)+len(tagged_dev_afr_raw)+len(tagged_dev_afr_raw))

AFRIKAANS
Tagged sentences in train set:  1315
Tagged words in train set: 33894
Tagged sentences in dev set:  194
Tagged words in dev set: 5317
Tagged sentences in test set:  425
Tagged words in test set: 10065
****************************************
Total sentences in dataset: 1703


In [7]:
print("DUTCH")
print("Tagged sentences in train set: ", len(tagged_train_du_raw))
print("Tagged words in train set:", len([item for sublist in tagged_train_du_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_du_raw))
print("Tagged words in dev set:", len([item for sublist in tagged_dev_du_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_du_raw))
print("Tagged words in test set:", len([item for sublist in tagged_test_du_raw for item in sublist]))
print(40*'*')
print("Total sentences in dataset:", len(tagged_train_du_raw)+len(tagged_dev_du_raw)+len(tagged_dev_du_raw))

DUTCH
Tagged sentences in train set:  12264
Tagged words in train set: 185999
Tagged sentences in dev set:  718
Tagged words in dev set: 11549
Tagged sentences in test set:  12264
Tagged words in test set: 185999
****************************************
Total sentences in dataset: 13700


In [8]:
# from https://github.com/tringm/POSTagger_Pytorch/blob/master/src/util/nlp.py
def build_tag_field(sentences_tokens):
    token_field = tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    fields = [('tokens', token_field)]
    examples = [tt.Example.fromlist([t], fields) for t in sentences_tokens]
    torch_dataset = tt.Dataset(examples, fields)
    return token_field
    
def build_text_field(sentences_words):
    text_field = tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    fields = [('text', text_field)]
    examples = [tt.Example.fromlist([t], fields) for t in sentences_words]
    torch_dataset = tt.Dataset(examples, fields)
    return text_field

In [9]:
#fields, AFR
train_afr = build_text_field(train_afr_raw)
dev_afr = build_text_field(dev_afr_raw)
test_afr = build_text_field(test_afr_raw)
tagged_train_afr = build_tag_field(tagged_train_afr_raw)
tagged_dev_afr = build_tag_field(tagged_dev_afr_raw)
tagged_test_afr = build_tag_field(tagged_test_afr_raw)

fields_train_afr = (("text", train_afr), ("udtags", tagged_train_afr))
examples_train_afr = [tt.Example.fromlist(item, fields_train_afr) for item in zip(train_afr_raw, tagged_train_afr_raw)]
fields_dev_afr = (("text", dev_afr), ("udtags", tagged_dev_afr))
examples_dev_afr = [tt.Example.fromlist(item, fields_dev_afr) for item in zip(dev_afr_raw, tagged_dev_afr_raw)]
fields_test_afr = (("text", test_afr), ("udtags", tagged_test_afr))
examples_test_afr = [tt.Example.fromlist(item, fields_test_afr) for item in zip(test_afr_raw, tagged_test_afr_raw)]

train_data_afr = tt.Dataset(examples_train_afr, fields_train_afr)
valid_data_afr = tt.Dataset(examples_dev_afr, fields_dev_afr)
test_data_afr = tt.Dataset(examples_test_afr, fields_test_afr)

#build vocabs so that they are shared between splits
train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
dev_afr.vocab = train_afr.vocab
test_afr.vocab = train_afr.vocab
tagged_train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
tagged_dev_afr.vocab = tagged_train_afr.vocab
tagged_test_afr.vocab = tagged_train_afr.vocab

In [10]:
#fields, DUT
train_du = build_text_field(train_du_raw)
dev_du = build_text_field(dev_du_raw)
test_du = build_text_field(test_du_raw)
tagged_train_du = build_tag_field(tagged_train_du_raw)
tagged_dev_du = build_tag_field(tagged_dev_du_raw)
tagged_test_du = build_tag_field(tagged_test_du_raw)

fields_train_du = (("text", train_du), ("udtags", tagged_train_du))
examples_train_du = [tt.Example.fromlist(item, fields_train_du) for item in zip(train_du_raw, tagged_train_du_raw)]
fields_dev_du = (("text", dev_du), ("udtags", tagged_dev_du))
examples_dev_du = [tt.Example.fromlist(item, fields_dev_du) for item in zip(dev_du_raw, tagged_dev_du_raw)]
fields_test_du = (("text", test_du), ("udtags", tagged_test_du))
examples_test_du = [tt.Example.fromlist(item, fields_test_du) for item in zip(test_du_raw, tagged_test_du_raw)]

train_data_du = tt.Dataset(examples_train_du, fields_train_du)
valid_data_du = tt.Dataset(examples_dev_du, fields_dev_du)
test_data_du = tt.Dataset(examples_test_du, fields_test_du)

#build vocabs so that they are shared between splits
train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
dev_du.vocab = train_du.vocab
test_du.vocab = train_du.vocab
tagged_train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
tagged_dev_du.vocab = tagged_train_du.vocab
tagged_test_du.vocab = tagged_train_du.vocab

In [32]:
matrix_len = len(train_du.vocab.itos)
weights_matrix = torch.zeros((matrix_len, embedding_dim))
words_found = 0
words_missing = 0

for i, word in enumerate(train_afr.vocab.itos):
#     print(i, word)
    try: 
        weights_matrix[i] = du_embed[word]
        words_found += 1
    except KeyError:
#         print("test")
        weights_matrix[i] = torch.empty(embedding_dim, ).normal_()
        words_missing += 1
print("Words Found: ", words_found)
print("Words Missing: ", words_missing)

Words Found:  158
Words Missing:  4519


In [12]:
def create_emb_layer(weights_matrix, non_trainable=False):
    input_dim, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(input_dim, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, input_dim, embedding_dim

In [13]:
# from https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
#model
batch_size=128
device = torch.device('cpu')

#needs to be tuple of dataset objects
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_du, valid_data_du, test_data_du), 
    batch_size = batch_size,
    device = device, sort=False)

In [14]:
# try without dropout first
class BiLSTMTagger(nn.Module):
    #https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional)
        #fully connected layer
        self.fc = nn.Linear((hidden_dim * 2 if bidirectional else hidden_dim), output_dim)
     
    
    def forward(self, text):
        embedded = self.embedding(text)
        outputs, (hidden, cell) = self.lstm(embedded)
        predictions = self.fc(outputs)
        return predictions

In [15]:
# try without dropout first
class BiLSTMTagger_PreEmbed(nn.Module):
    #https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
    def __init__(self, weights_matrix, hidden_dim, output_dim, n_layers, bidirectional, pad_idx):
        super().__init__()
        self.embedding, input_dim, embedding_dim = create_emb_layer(weights_matrix, True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional)
        #fully connected layer
        self.fc = nn.Linear((hidden_dim * 2 if bidirectional else hidden_dim), output_dim)
     
    
    def forward(self, text):
        embedded = self.embedding(text)
        outputs, (hidden, cell) = self.lstm(embedded)
        predictions = self.fc(outputs)
        return predictions

In [16]:
in_dim = len(train_du.vocab)
emb_dim = 100
hid_dim = 128
out_dim = len(tagged_train_du.vocab)
n_layers = 1
bidirectional = True
pad_index = train_du.vocab.stoi[train_du.pad_token]
tag_pad_idx = tagged_train_du.vocab.stoi[tagged_train_du.pad_token]

In [17]:
model = BiLSTMTagger_PreEmbed(weights_matrix, hid_dim, out_dim, n_layers, bidirectional, pad_index)
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model.parameters())

In [18]:
def categorical_accuracy(preds, y, tag_pad_idx):
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [19]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        text = batch.text
        tags = batch.udtags
        
        optimizer.zero_grad()       
        predictions = model(text)        
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        loss = criterion(predictions, tags) 
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [20]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            tags = batch.udtags
            
            predictions = model(text)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [22]:
N_EPOCHS = 10

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 33s
	Train Loss: 1.699 | Train Acc: 49.08%
	 Val. Loss: 1.303 |  Val. Acc: 60.90%
Epoch: 02 | Epoch Time: 1m 31s
	Train Loss: 0.890 | Train Acc: 73.40%
	 Val. Loss: 1.012 |  Val. Acc: 70.18%
Epoch: 03 | Epoch Time: 1m 10s
	Train Loss: 0.612 | Train Acc: 82.17%
	 Val. Loss: 0.862 |  Val. Acc: 75.02%
Epoch: 04 | Epoch Time: 1m 9s
	Train Loss: 0.464 | Train Acc: 86.62%
	 Val. Loss: 0.808 |  Val. Acc: 77.57%
Epoch: 05 | Epoch Time: 1m 8s
	Train Loss: 0.374 | Train Acc: 89.35%
	 Val. Loss: 0.773 |  Val. Acc: 79.32%
Epoch: 06 | Epoch Time: 1m 6s
	Train Loss: 0.315 | Train Acc: 91.09%
	 Val. Loss: 0.752 |  Val. Acc: 80.20%
Epoch: 07 | Epoch Time: 1m 7s
	Train Loss: 0.273 | Train Acc: 92.20%
	 Val. Loss: 0.796 |  Val. Acc: 80.17%
Epoch: 08 | Epoch Time: 1m 10s
	Train Loss: 0.241 | Train Acc: 93.10%
	 Val. Loss: 0.788 |  Val. Acc: 81.01%
Epoch: 09 | Epoch Time: 1m 7s
	Train Loss: 0.215 | Train Acc: 93.87%
	 Val. Loss: 0.784 |  Val. Acc: 81.25%
Epoch: 10 | Epoch Time: 

In [23]:
test_loss, test_acc = evaluate(model, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.164 |  Test Acc: 95.52%


In [24]:
print(list(model.named_parameters()))

[('embedding.weight', Parameter containing:
tensor([[-0.9891,  0.0520,  0.2516,  ..., -0.5476, -1.3248,  0.7580],
        [-1.9092,  0.9414,  0.1942,  ...,  0.6298,  0.1553,  1.1409],
        [ 0.0358,  0.8690,  0.1499,  ...,  0.2708, -0.0618,  1.2390],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])), ('lstm.weight_ih_l0', Parameter containing:
tensor([[-0.2180, -0.0073,  0.0912,  ...,  0.0290, -0.1063,  0.0969],
        [-0.0879,  0.0067, -0.0118,  ..., -0.0657, -0.0923,  0.0284],
        [ 0.1929, -0.1973,  0.0762,  ..., -0.3319, -0.0082,  0.0360],
        ...,
        [-0.0722, -0.0550, -0.0291,  ..., -0.0740, -0.2051, -0.0015],
        [-0.0739,  0.1168, -0.1411,  ..., -0.0786,  0.0053, -0.0652],
        [ 0.2023, -0.1211,  0.1133,  ...,  0.0076,  0.1061,  0.0340]],
       requires_grad=True)), ('lstm.weight

In [25]:
#TRANSFER
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_afr, valid_data_afr, test_data_afr), 
    batch_size = batch_size,
    device = device, sort=False)

In [26]:
in_dim = len(train_afr.vocab)
emb_dim = 100
hid_dim = 128
out_dim = len(tagged_train_afr.vocab)
n_layers = 1
bidirectional = True
pad_index = train_afr.vocab.stoi[train_afr.pad_token]
tag_pad_idx = tagged_train_afr.vocab.stoi[tagged_train_afr.pad_token]

In [27]:
model2 = BiLSTMTagger(in_dim, emb_dim, hid_dim, out_dim, n_layers, bidirectional, pad_index)
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model2.parameters())

In [28]:
#populate dutch params in dict
transfer_param_dict = {}
params = model.named_parameters()
for name, param in params:
    transfer_param_dict[name] = param.data

In [29]:
print(transfer_param_dict.keys())

dict_keys(['embedding.weight', 'lstm.weight_ih_l0', 'lstm.weight_hh_l0', 'lstm.bias_ih_l0', 'lstm.bias_hh_l0', 'lstm.weight_ih_l0_reverse', 'lstm.weight_hh_l0_reverse', 'lstm.bias_ih_l0_reverse', 'lstm.bias_hh_l0_reverse', 'fc.weight', 'fc.bias'])


In [30]:
#
params2 = model2.named_parameters()
for name, param in params2:
    if(name == "embedding.weight" or name == "fc.weight" or name == "fc.bias"):
        continue
    else:
        param.data = transfer_param_dict[name]

In [31]:
N_EPOCHS = 10

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model2, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model2, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: size mismatch, m1: [11520 x 100], m2: [300 x 512] at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\aten\src\TH/generic/THTensorMath.cpp:136

In [None]:
test_loss, test_acc = evaluate(model2, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')