In [11]:
import pyconll #pip3 install this if you don't have it
import torchtext.data as tt
import torch 
import torch.nn as nn
import torch.optim as optim
import time
import torchtext

In [12]:
AFRIKAANS_TRAIN = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu'
AFRIKAANS_DEV = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu'
AFRIKAANS_TEST = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu'

DUTCH_TRAIN = "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
DUTCH_DEV = "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
DUTCH_TEST = "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"

In [13]:
# from https://github.com/soutsios/pos-tagger-bert/blob/master/pos_tagger_bert.ipynb
def make_sentences(path):
    data = pyconll.load_from_file(path)
    sentences = []
    tagged_sentences = []
    for each in data:
        tagged_sentence=[]
        sentence = []
        for token in each:
            if token.upos and token.form:
                tagged_sentence.append(token.upos)
                sentence.append(token.form.lower())
        tagged_sentences.append(tagged_sentence)
        sentences.append(sentence)
    return sentences, tagged_sentences

In [14]:
train_afr_raw, tagged_train_afr_raw = make_sentences(AFRIKAANS_TRAIN)
dev_afr_raw, tagged_dev_afr_raw = make_sentences(AFRIKAANS_DEV)
test_afr_raw, tagged_test_afr_raw = make_sentences(AFRIKAANS_TEST)

train_du_raw, tagged_train_du_raw = make_sentences(DUTCH_TRAIN)
dev_du_raw, tagged_dev_du_raw = make_sentences(DUTCH_DEV)
test_du_raw, tagged_test_du_raw = make_sentences(DUTCH_TEST)

In [15]:
print("AFRIKAANS")
print("Tagged sentences in train set: ", len(tagged_train_afr_raw))
print("Tagged words in train set:", len([item for sublist in tagged_train_afr_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_afr_raw))
print("Tagged words in dev set:", len([item for sublist in tagged_dev_afr_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_afr_raw))
print("Tagged words in test set:", len([item for sublist in tagged_test_afr_raw for item in sublist]))
print(40*'*')
print("Total sentences in dataset:", len(tagged_train_afr_raw)+len(tagged_dev_afr_raw)+len(tagged_test_afr_raw))

AFRIKAANS
Tagged sentences in train set:  1315
Tagged words in train set: 33894
Tagged sentences in dev set:  194
Tagged words in dev set: 5317
Tagged sentences in test set:  425
Tagged words in test set: 10065
****************************************
Total sentences in dataset: 1934


In [16]:
print("DUTCH")
print("Tagged sentences in train set: ", len(tagged_train_du_raw))
print("Tagged words in train set:", len([item for sublist in tagged_train_du_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_du_raw))
print("Tagged words in dev set:", len([item for sublist in tagged_dev_du_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_du_raw))
print("Tagged words in test set:", len([item for sublist in tagged_test_du_raw for item in sublist]))
print(40*'*')
print("Total sentences in dataset:", len(tagged_train_du_raw)+len(tagged_dev_du_raw)+len(tagged_test_du_raw))

DUTCH
Tagged sentences in train set:  12264
Tagged words in train set: 185999
Tagged sentences in dev set:  718
Tagged words in dev set: 11549
Tagged sentences in test set:  596
Tagged words in test set: 11053
****************************************
Total sentences in dataset: 13578


In [17]:
# from https://github.com/tringm/POSTagger_Pytorch/blob/master/src/util/nlp.py
def build_tag_field(sentences_tokens):
    token_field = tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    fields = [('tokens', token_field)]
    examples = [tt.Example.fromlist([t], fields) for t in sentences_tokens]
    torch_dataset = tt.Dataset(examples, fields)
    return token_field
    
def build_text_field(sentences_words):
    text_field = tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    fields = [('text', text_field)]
    examples = [tt.Example.fromlist([t], fields) for t in sentences_words]
    torch_dataset = tt.Dataset(examples, fields)
    return text_field

In [18]:
#fields, AFR
train_afr = build_text_field(train_afr_raw)
dev_afr = build_text_field(dev_afr_raw)
test_afr = build_text_field(test_afr_raw)
tagged_train_afr = build_tag_field(tagged_train_afr_raw)
tagged_dev_afr = build_tag_field(tagged_dev_afr_raw)
tagged_test_afr = build_tag_field(tagged_test_afr_raw)

fields_train_afr = (("text", train_afr), ("udtags", tagged_train_afr))
examples_train_afr = [tt.Example.fromlist(item, fields_train_afr) for item in zip(train_afr_raw, tagged_train_afr_raw)]
fields_dev_afr = (("text", dev_afr), ("udtags", tagged_dev_afr))
examples_dev_afr = [tt.Example.fromlist(item, fields_dev_afr) for item in zip(dev_afr_raw, tagged_dev_afr_raw)]
fields_test_afr = (("text", test_afr), ("udtags", tagged_test_afr))
examples_test_afr = [tt.Example.fromlist(item, fields_test_afr) for item in zip(test_afr_raw, tagged_test_afr_raw)]

train_data_afr = tt.Dataset(examples_train_afr, fields_train_afr)
valid_data_afr = tt.Dataset(examples_dev_afr, fields_dev_afr)
test_data_afr = tt.Dataset(examples_test_afr, fields_test_afr)

#build vocabs so that they are shared between splits
train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
#train_afr.vocab = af_vec
dev_afr.vocab = train_afr.vocab
test_afr.vocab = train_afr.vocab
tagged_train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
tagged_dev_afr.vocab = tagged_train_afr.vocab
tagged_test_afr.vocab = tagged_train_afr.vocab

In [19]:
#fields, DUT
train_du = build_text_field(train_du_raw)
dev_du = build_text_field(dev_du_raw)
test_du = build_text_field(test_du_raw)
tagged_train_du = build_tag_field(tagged_train_du_raw)
tagged_dev_du = build_tag_field(tagged_dev_du_raw)
tagged_test_du = build_tag_field(tagged_test_du_raw)

fields_train_du = (("text", train_du), ("udtags", tagged_train_du))
examples_train_du = [tt.Example.fromlist(item, fields_train_du) for item in zip(train_du_raw, tagged_train_du_raw)]
fields_dev_du = (("text", dev_du), ("udtags", tagged_dev_du))
examples_dev_du = [tt.Example.fromlist(item, fields_dev_du) for item in zip(dev_du_raw, tagged_dev_du_raw)]
fields_test_du = (("text", test_du), ("udtags", tagged_test_du))
examples_test_du = [tt.Example.fromlist(item, fields_test_du) for item in zip(test_du_raw, tagged_test_du_raw)]

train_data_du = tt.Dataset(examples_train_du, fields_train_du)
valid_data_du = tt.Dataset(examples_dev_du, fields_dev_du)
test_data_du = tt.Dataset(examples_test_du, fields_test_du)

#build vocabs so that they are shared between splits
train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
#train_du.vocab = nl_vec
dev_du.vocab = train_du.vocab
test_du.vocab = train_du.vocab
tagged_train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
tagged_dev_du.vocab = tagged_train_du.vocab
tagged_test_du.vocab = tagged_train_du.vocab

In [20]:
# from https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
#model
batch_size=128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#needs to be tuple of dataset objects
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_du, valid_data_du, test_data_du), 
    batch_size = batch_size,
    device = device, sort=False)

In [21]:
# try without dropout first
class BiLSTMTagger(nn.Module):
    #https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional)
        #fully connected layer
        self.fc = nn.Linear((hidden_dim * 2 if bidirectional else hidden_dim), output_dim)
     
    
    def forward(self, text):
        embedded = self.embedding(text)
        outputs, (hidden, cell) = self.lstm(embedded)
        predictions = self.fc(outputs)
        return predictions

In [22]:
in_dim = len(train_du.vocab)
emb_dim = 100
hid_dim = 128
out_dim = len(tagged_train_du.vocab)
n_layers = 1
bidirectional = True
pad_index = train_du.vocab.stoi[train_du.pad_token]
tag_pad_idx = tagged_train_du.vocab.stoi[tagged_train_du.pad_token]

In [23]:
model = BiLSTMTagger(in_dim, emb_dim, hid_dim, out_dim, n_layers, bidirectional, pad_index)
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model.parameters())

In [24]:
def categorical_accuracy(preds, y, tag_pad_idx):
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [25]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        text = batch.text
        tags = batch.udtags
        
        optimizer.zero_grad()       
        predictions = model(text)        
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        loss = criterion(predictions, tags) 
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [26]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            tags = batch.udtags
            
            predictions = model(text)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [28]:
N_EPOCHS = 10

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 34s
	Train Loss: 1.442 | Train Acc: 60.19%
	 Val. Loss: 0.874 |  Val. Acc: 71.62%
Epoch: 02 | Epoch Time: 0m 34s
	Train Loss: 0.639 | Train Acc: 79.46%
	 Val. Loss: 0.666 |  Val. Acc: 78.68%
Epoch: 03 | Epoch Time: 0m 32s
	Train Loss: 0.479 | Train Acc: 84.53%
	 Val. Loss: 0.572 |  Val. Acc: 81.58%
Epoch: 04 | Epoch Time: 0m 46s
	Train Loss: 0.385 | Train Acc: 87.71%
	 Val. Loss: 0.529 |  Val. Acc: 82.94%
Epoch: 05 | Epoch Time: 0m 53s
	Train Loss: 0.317 | Train Acc: 90.02%
	 Val. Loss: 0.495 |  Val. Acc: 83.99%
Epoch: 06 | Epoch Time: 0m 55s
	Train Loss: 0.264 | Train Acc: 91.78%
	 Val. Loss: 0.468 |  Val. Acc: 85.02%
Epoch: 07 | Epoch Time: 0m 50s
	Train Loss: 0.221 | Train Acc: 93.26%
	 Val. Loss: 0.468 |  Val. Acc: 85.50%
Epoch: 08 | Epoch Time: 0m 43s
	Train Loss: 0.185 | Train Acc: 94.52%
	 Val. Loss: 0.455 |  Val. Acc: 86.09%
Epoch: 09 | Epoch Time: 0m 42s
	Train Loss: 0.153 | Train Acc: 95.58%
	 Val. Loss: 0.454 |  Val. Acc: 86.54%
Epoch: 10 | Epoch T

In [29]:
test_loss, test_acc = evaluate(model, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.560 |  Test Acc: 85.03%


In [30]:
print(list(model.named_parameters()))

[('embedding.weight', Parameter containing:
tensor([[-1.5510,  0.2902,  1.4410,  ...,  1.4271,  1.8291, -1.3389],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7725,  0.3579,  0.5922,  ...,  1.1290, -0.7719,  0.6018],
        ...,
        [-2.2577, -0.9497, -1.0379,  ..., -1.0123,  2.0057,  0.1253],
        [ 0.5489,  0.5219,  1.7291,  ...,  0.6690,  0.8639,  0.4860],
        [ 0.5249, -0.1805,  0.4028,  ...,  0.1350, -0.1444, -0.7871]],
       requires_grad=True)), ('lstm.weight_ih_l0', Parameter containing:
tensor([[-0.0849, -0.2312,  0.0095,  ..., -0.1276,  0.0494,  0.1115],
        [-0.0313, -0.1294,  0.1299,  ..., -0.0491, -0.0870,  0.1488],
        [ 0.0720, -0.1329,  0.1901,  ..., -0.0552,  0.0060,  0.0280],
        ...,
        [ 0.0497,  0.2230,  0.1945,  ...,  0.0044, -0.1269, -0.1645],
        [ 0.1972,  0.0509, -0.1077,  ...,  0.2021, -0.0643,  0.1017],
        [ 0.0501,  0.0225,  0.0436,  ...,  0.0241, -0.0547, -0.0349]],
       requires

In [31]:
#TRANSFER
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_afr, valid_data_afr, test_data_afr), 
    batch_size = batch_size,
    device = device, sort=False)

In [32]:
in_dim = len(train_afr.vocab)
emb_dim = 100
hid_dim = 128
out_dim = len(tagged_train_afr.vocab)
n_layers = 1
bidirectional = True
pad_index = train_afr.vocab.stoi[train_afr.pad_token]
tag_pad_idx = tagged_train_afr.vocab.stoi[tagged_train_afr.pad_token]

In [33]:
model2 = BiLSTMTagger(in_dim, emb_dim, hid_dim, out_dim, n_layers, bidirectional, pad_index)
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model2.parameters())

In [34]:
#populate dutch params in dict
transfer_param_dict = {}
params = model.named_parameters()
for name, param in params:
    transfer_param_dict[name] = param.data

In [35]:
print(transfer_param_dict.keys())

dict_keys(['embedding.weight', 'lstm.weight_ih_l0', 'lstm.weight_hh_l0', 'lstm.bias_ih_l0', 'lstm.bias_hh_l0', 'lstm.weight_ih_l0_reverse', 'lstm.weight_hh_l0_reverse', 'lstm.bias_ih_l0_reverse', 'lstm.bias_hh_l0_reverse', 'fc.weight', 'fc.bias'])


In [36]:
# trying with and without fully connecected weights. No embedding weights however
params2 = model2.named_parameters()
for name, param in params2:
    if(name == "embedding.weight" or name == "fc.weight" or name == "fc.bias"):
    #if(name == "embedding.weight"):
        continue
    else:
        param.data = transfer_param_dict[name]

In [37]:
N_EPOCHS = 10

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model2, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model2, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 6s
	Train Loss: 2.449 | Train Acc: 30.22%
	 Val. Loss: 2.111 |  Val. Acc: 42.41%
Epoch: 02 | Epoch Time: 0m 6s
	Train Loss: 1.771 | Train Acc: 56.95%
	 Val. Loss: 1.661 |  Val. Acc: 60.65%
Epoch: 03 | Epoch Time: 0m 6s
	Train Loss: 1.372 | Train Acc: 66.81%
	 Val. Loss: 1.365 |  Val. Acc: 63.73%
Epoch: 04 | Epoch Time: 0m 6s
	Train Loss: 1.110 | Train Acc: 71.11%
	 Val. Loss: 1.154 |  Val. Acc: 68.46%
Epoch: 05 | Epoch Time: 0m 6s
	Train Loss: 0.929 | Train Acc: 75.19%
	 Val. Loss: 0.997 |  Val. Acc: 72.12%
Epoch: 06 | Epoch Time: 0m 6s
	Train Loss: 0.795 | Train Acc: 78.47%
	 Val. Loss: 0.885 |  Val. Acc: 75.30%
Epoch: 07 | Epoch Time: 0m 6s
	Train Loss: 0.685 | Train Acc: 81.43%
	 Val. Loss: 0.801 |  Val. Acc: 77.42%
Epoch: 08 | Epoch Time: 0m 6s
	Train Loss: 0.607 | Train Acc: 83.71%
	 Val. Loss: 0.735 |  Val. Acc: 78.73%
Epoch: 09 | Epoch Time: 0m 6s
	Train Loss: 0.536 | Train Acc: 85.73%
	 Val. Loss: 0.685 |  Val. Acc: 79.94%
Epoch: 10 | Epoch Time: 0m 6

In [38]:
test_loss, test_acc = evaluate(model2, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.600 |  Test Acc: 82.53%
