In [1]:
import pyconll #pip3 install this if you don't have it
import torchtext.data as tt
import torchtext
import torch 
import torch.nn as nn
import torch.optim as optim
import time
from tools import EarlyStopping

In [2]:
af_vec = torchtext.vocab.FastText(language='af')
du_vec = torchtext.vocab.FastText(language='nl')

In [207]:
embedding_dim = 300

In [208]:
AFRIKAANS_TRAIN = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-train.conllu'
AFRIKAANS_DEV = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-dev.conllu'
AFRIKAANS_TEST = 'UD_Afrikaans-AfriBooms/af_afribooms-ud-test.conllu'

DUTCH_TRAIN = "UD_Dutch-Alpino/nl_alpino-ud-train.conllu"
DUTCH_DEV = "UD_Dutch-Alpino/nl_alpino-ud-dev.conllu"
DUTCH_TEST = "UD_Dutch-Alpino/nl_alpino-ud-test.conllu"

In [209]:
# from https://github.com/soutsios/pos-tagger-bert/blob/master/pos_tagger_bert.ipynb
def make_sentences(path):
    data = pyconll.load_from_file(path)
    sentences = []
    tagged_sentences = []
    for each in data:
        tagged_sentence=[]
        sentence = []
        for token in each:
            if token.upos and token.form:
                tagged_sentence.append(token.upos)
                sentence.append(token.form.lower())
        tagged_sentences.append(tagged_sentence)
        sentences.append(sentence)
    return sentences, tagged_sentences

In [210]:
train_afr_raw, tagged_train_afr_raw = make_sentences(AFRIKAANS_TRAIN)
dev_afr_raw, tagged_dev_afr_raw = make_sentences(AFRIKAANS_DEV)
test_afr_raw, tagged_test_afr_raw = make_sentences(AFRIKAANS_TEST)

train_du_raw, tagged_train_du_raw = make_sentences(DUTCH_TRAIN)
dev_du_raw, tagged_dev_du_raw = make_sentences(DUTCH_DEV)
test_du_raw, tagged_test_du_raw = make_sentences(DUTCH_TEST)

In [211]:
print("AFRIKAANS")
print("Tagged sentences in train set: ", len(tagged_train_afr_raw))
print("Tagged words in train set:", len([item for sublist in tagged_train_afr_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_afr_raw))
print("Tagged words in dev set:", len([item for sublist in tagged_dev_afr_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_afr_raw))
print("Tagged words in test set:", len([item for sublist in tagged_test_afr_raw for item in sublist]))
print(40*'*')
print("Total sentences in dataset:", len(tagged_train_afr_raw)+len(tagged_dev_afr_raw)+len(tagged_test_afr_raw))

AFRIKAANS
Tagged sentences in train set:  1315
Tagged words in train set: 33894
Tagged sentences in dev set:  194
Tagged words in dev set: 5317
Tagged sentences in test set:  425
Tagged words in test set: 10065
****************************************
Total sentences in dataset: 1934


In [212]:
print("DUTCH")
print("Tagged sentences in train set: ", len(tagged_train_du_raw))
print("Tagged words in train set:", len([item for sublist in tagged_train_du_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in dev set: ", len(tagged_dev_du_raw))
print("Tagged words in dev set:", len([item for sublist in tagged_dev_du_raw for item in sublist]))
print(40*'=')
print("Tagged sentences in test set: ", len(tagged_test_du_raw))
print("Tagged words in test set:", len([item for sublist in tagged_test_du_raw for item in sublist]))
print(40*'*')
print("Total sentences in dataset:", len(tagged_train_du_raw)+len(tagged_dev_du_raw)+len(tagged_test_du_raw))

DUTCH
Tagged sentences in train set:  12264
Tagged words in train set: 185999
Tagged sentences in dev set:  718
Tagged words in dev set: 11549
Tagged sentences in test set:  596
Tagged words in test set: 11053
****************************************
Total sentences in dataset: 13578


In [213]:
# from https://github.com/tringm/POSTagger_Pytorch/blob/master/src/util/nlp.py
def build_tag_field(sentences_tokens):
    token_field = tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    fields = [('tokens', token_field)]
    examples = [tt.Example.fromlist([t], fields) for t in sentences_tokens]
    torch_dataset = tt.Dataset(examples, fields)
    return token_field
    
def build_text_field(sentences_words):
    text_field = tt.Field(tokenize=list, init_token="<bos>", eos_token="<eos>")
    fields = [('text', text_field)]
    examples = [tt.Example.fromlist([t], fields) for t in sentences_words]
    torch_dataset = tt.Dataset(examples, fields)
    return text_field

In [214]:
#fields, AFR
train_afr = build_text_field(train_afr_raw)
dev_afr = build_text_field(dev_afr_raw)
test_afr = build_text_field(test_afr_raw)
tagged_train_afr = build_tag_field(tagged_train_afr_raw)
tagged_dev_afr = build_tag_field(tagged_dev_afr_raw)
tagged_test_afr = build_tag_field(tagged_test_afr_raw)

fields_train_afr = (("text", train_afr), ("udtags", tagged_train_afr))
examples_train_afr = [tt.Example.fromlist(item, fields_train_afr) for item in zip(train_afr_raw, tagged_train_afr_raw)]
fields_dev_afr = (("text", dev_afr), ("udtags", tagged_dev_afr))
examples_dev_afr = [tt.Example.fromlist(item, fields_dev_afr) for item in zip(dev_afr_raw, tagged_dev_afr_raw)]
fields_test_afr = (("text", test_afr), ("udtags", tagged_test_afr))
examples_test_afr = [tt.Example.fromlist(item, fields_test_afr) for item in zip(test_afr_raw, tagged_test_afr_raw)]

train_data_afr = tt.Dataset(examples_train_afr, fields_train_afr)
valid_data_afr = tt.Dataset(examples_dev_afr, fields_dev_afr)
test_data_afr = tt.Dataset(examples_test_afr, fields_test_afr)

#build vocabs so that they are shared between splits
train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
dev_afr.vocab = train_afr.vocab
test_afr.vocab = train_afr.vocab
tagged_train_afr.build_vocab(train_data_afr, valid_data_afr, test_data_afr)
tagged_dev_afr.vocab = tagged_train_afr.vocab
tagged_test_afr.vocab = tagged_train_afr.vocab

In [215]:
#fields, DUT
train_du = build_text_field(train_du_raw)
dev_du = build_text_field(dev_du_raw)
test_du = build_text_field(test_du_raw)
tagged_train_du = build_tag_field(tagged_train_du_raw)
tagged_dev_du = build_tag_field(tagged_dev_du_raw)
tagged_test_du = build_tag_field(tagged_test_du_raw)

fields_train_du = (("text", train_du), ("udtags", tagged_train_du))
examples_train_du = [tt.Example.fromlist(item, fields_train_du) for item in zip(train_du_raw, tagged_train_du_raw)]
fields_dev_du = (("text", dev_du), ("udtags", tagged_dev_du))
examples_dev_du = [tt.Example.fromlist(item, fields_dev_du) for item in zip(dev_du_raw, tagged_dev_du_raw)]
fields_test_du = (("text", test_du), ("udtags", tagged_test_du))
examples_test_du = [tt.Example.fromlist(item, fields_test_du) for item in zip(test_du_raw, tagged_test_du_raw)]

train_data_du = tt.Dataset(examples_train_du, fields_train_du)
valid_data_du = tt.Dataset(examples_dev_du, fields_dev_du)
test_data_du = tt.Dataset(examples_test_du, fields_test_du)

#build vocabs so that they are shared between splits
train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
dev_du.vocab = train_du.vocab
test_du.vocab = train_du.vocab
tagged_train_du.build_vocab(train_data_du, valid_data_du, test_data_du)
tagged_dev_du.vocab = tagged_train_du.vocab
tagged_test_du.vocab = tagged_train_du.vocab

In [216]:
# Code adapted from https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
afr_matrix_len = len(train_afr.vocab.itos)
afr_weights_matrix = torch.zeros((afr_matrix_len, embedding_dim))
words_found = 0
words_missing = 0

for i, word in enumerate(train_afr.vocab.itos):
#     print(i, word)
    try: 
        afr_weights_matrix[i] = af_vec[word]
        words_found += 1
    except KeyError:
#         print("test")
        afr_weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))
        words_missing += 1

print("Afrikaans words missing: ", words_missing)

du_matrix_len = len(train_du.vocab.itos)
du_weights_matrix = torch.zeros((du_matrix_len, embedding_dim))
words_found = 0
words_missing = 0

for i, word in enumerate(train_afr.vocab.itos):
#     print(i, word)
    try: 
        du_weights_matrix[i] = du_vec[word]
        words_found += 1
    except KeyError:
#         print("test")
        du_weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))
        words_missing += 1

print("dutch words missing: ", words_missing)

Afrikaans words missing:  0
dutch words missing:  0


In [217]:
def create_emb_layer(weights_matrix, pad_idx, non_trainable=False):
    input_dim, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, input_dim, embedding_dim

In [218]:
# from https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
#model
batch_size=128
device = torch.device('cpu')

#needs to be tuple of dataset objects
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_du, valid_data_du, test_data_du), 
    batch_size = batch_size,
    device = device, sort=False)

In [219]:
# try without dropout first
class BiLSTMTagger(nn.Module):
    #https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional)
        #fully connected layer
        self.fc = nn.Linear((hidden_dim * 2 if bidirectional else hidden_dim), output_dim)
     
    
    def forward(self, text):
        embedded = self.embedding(text)
        outputs, (hidden, cell) = self.lstm(embedded)
        predictions = self.fc(outputs)
        return predictions

class BiLSTMTagger_Pretrained(nn.Module):
    #https://github.com/bentrevett/pytorch-pos-tagging/blob/master/1%20-%20BiLSTM%20for%20PoS%20Tagging.ipynb
    def __init__(self, weights_matrix, hidden_dim, output_dim, n_layers, bidirectional, pad_idx):
        super().__init__()
        self.embedding, input_dim, embedding_dim = create_emb_layer(weights_matrix, pad_idx, False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional)
        #fully connected layer
        self.fc = nn.Linear((hidden_dim * 2 if bidirectional else hidden_dim), output_dim)
     
    
    def forward(self, text):
        embedded = self.embedding(text)
        outputs, (hidden, cell) = self.lstm(embedded)
        predictions = self.fc(outputs)
        return predictions

In [220]:
in_dim = len(train_du.vocab)
emb_dim = 100
hid_dim = 128
out_dim = len(tagged_train_du.vocab)
n_layers = 1
bidirectional = True
pad_index = train_du.vocab.stoi[train_du.pad_token]
tag_pad_idx = tagged_train_du.vocab.stoi[tagged_train_du.pad_token]

In [221]:
model = BiLSTMTagger_Pretrained(du_weights_matrix, hid_dim, out_dim, n_layers, bidirectional, pad_index)
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model.parameters())

In [222]:
def categorical_accuracy(preds, y, tag_pad_idx):
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

In [223]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in iterator:
        text = batch.text
        tags = batch.udtags
        
        optimizer.zero_grad()       
        predictions = model(text)        
        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)
        
        loss = criterion(predictions, tags) 
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [224]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text = batch.text
            tags = batch.udtags
            
            predictions = model(text)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [225]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [226]:
N_EPOCHS = 50
patience = 3
early_stopping = EarlyStopping(patience=patience, verbose=False,filename='checkpt_tr_un.pt')

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    
    early_stopping(valid_loss, model)
    if early_stopping.early_stop:
        print("Early stopping, reloading checkpoint model")
        model.load_state_dict(torch.load('checkpt_tr_un.pt'))
        break
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 15s
	Train Loss: 1.523 | Train Acc: 57.30%
	 Val. Loss: 0.681 |  Val. Acc: 78.78%
Epoch: 02 | Epoch Time: 0m 53s
	Train Loss: 0.330 | Train Acc: 90.78%
	 Val. Loss: 0.314 |  Val. Acc: 90.40%
Epoch: 03 | Epoch Time: 0m 57s
	Train Loss: 0.137 | Train Acc: 96.31%
	 Val. Loss: 0.261 |  Val. Acc: 91.80%
Epoch: 04 | Epoch Time: 1m 9s
	Train Loss: 0.086 | Train Acc: 97.50%
	 Val. Loss: 0.243 |  Val. Acc: 92.30%
Epoch: 05 | Epoch Time: 1m 22s
	Train Loss: 0.064 | Train Acc: 98.13%
	 Val. Loss: 0.237 |  Val. Acc: 92.49%
EarlyStopping counter: 1 out of 3
Epoch: 06 | Epoch Time: 1m 50s
	Train Loss: 0.048 | Train Acc: 98.66%
	 Val. Loss: 0.248 |  Val. Acc: 92.12%
EarlyStopping counter: 2 out of 3
Epoch: 07 | Epoch Time: 1m 55s
	Train Loss: 0.036 | Train Acc: 99.02%
	 Val. Loss: 0.258 |  Val. Acc: 92.01%
EarlyStopping counter: 3 out of 3
Early stopping, reloading checkpoint model


In [227]:
test_loss, test_acc = evaluate(model, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.270 |  Test Acc: 91.49%


In [228]:
print(list(model.named_parameters()))

[('embedding.weight', Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1330,  0.1126, -0.1482,  ..., -0.1375,  0.1143, -0.1882],
        ...,
        [-0.0113, -0.0080,  0.0308,  ...,  0.0014, -0.0197,  0.0279],
        [ 0.0339, -0.0339, -0.0333,  ...,  0.0249, -0.0289, -0.0306],
        [ 0.0329, -0.0310, -0.0327,  ..., -0.0167, -0.0190, -0.0318]],
       requires_grad=True)), ('lstm.weight_ih_l0', Parameter containing:
tensor([[ 8.9730e-02, -6.9807e-02,  6.7183e-05,  ..., -3.3914e-02,
         -1.5017e-01,  9.2717e-02],
        [ 1.0583e-01, -4.5381e-02, -1.0718e-01,  ..., -5.9115e-02,
         -6.5828e-02,  4.2318e-02],
        [ 4.3417e-02,  3.8162e-02,  1.0084e-01,  ..., -5.8114e-02,
         -7.6137e-02,  1.0897e-01],
        ...,
        [ 6.2515e-02,  5.4579e-02,  1.6341e-01,  ..., -8.3769e-02,
         -1.0880e-01, -1.1832e-02],
        [ 1.1698e-01,  7

In [229]:
#TRANSFER
train_iterator, valid_iterator, test_iterator = tt.BucketIterator.splits(
    (train_data_afr, valid_data_afr, test_data_afr), 
    batch_size = batch_size,
    device = device, sort=False)

In [230]:
in_dim = len(train_afr.vocab)
emb_dim = 100
hid_dim = 128
out_dim = len(tagged_train_afr.vocab)
n_layers = 1
bidirectional = True
pad_index = train_afr.vocab.stoi[train_afr.pad_token]
tag_pad_idx = tagged_train_afr.vocab.stoi[tagged_train_afr.pad_token]

In [231]:
model2 = BiLSTMTagger_Pretrained(afr_weights_matrix, hid_dim, out_dim, n_layers, bidirectional, pad_index)
criterion = nn.CrossEntropyLoss(ignore_index = tag_pad_idx)
optimizer = optim.Adam(model2.parameters())

In [232]:
#populate dutch params in dict
transfer_param_dict = {}
params = model.named_parameters()
for name, param in params:
    transfer_param_dict[name] = param.data

In [233]:
print(transfer_param_dict.keys())

dict_keys(['embedding.weight', 'lstm.weight_ih_l0', 'lstm.weight_hh_l0', 'lstm.bias_ih_l0', 'lstm.bias_hh_l0', 'lstm.weight_ih_l0_reverse', 'lstm.weight_hh_l0_reverse', 'lstm.bias_ih_l0_reverse', 'lstm.bias_hh_l0_reverse', 'fc.weight', 'fc.bias'])


In [None]:
#
params2 = model2.named_parameters()
for name, param in params2:
    if(name == "embedding.weight" or name == "fc.weight" or name == "fc.bias"):
        continue
    else:
        param.data = transfer_param_dict[name]

In [None]:
N_EPOCHS = 50
patience = 3
early_stopping = EarlyStopping(patience=patience, verbose=False,filename='checkpt_tr_un2.pt')

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model2, train_iterator, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = evaluate(model2, valid_iterator, criterion, tag_pad_idx)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
    
    early_stopping(valid_loss, model2)
    if early_stopping.early_stop:
        print("Early stopping, reloading checkpoint model")
        model2.load_state_dict(torch.load('checkpt_tr_un2.pt'))
        break
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 17s
	Train Loss: 2.510 | Train Acc: 21.95%
	 Val. Loss: 2.294 |  Val. Acc: 34.30%
Epoch: 02 | Epoch Time: 0m 16s
	Train Loss: 2.002 | Train Acc: 49.61%
	 Val. Loss: 1.871 |  Val. Acc: 57.52%


In [None]:
test_loss, test_acc = evaluate(model2, test_iterator, criterion, tag_pad_idx)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

(90.96,93.60) (91.04, 93.04) (90.89, 92.75) (90.98, 93.48),(90.85, 93.17) 2layers

1layer