## Naive tokenizer

In [1]:
import torch
from typing import List

class NaiveTokenizer:
    """Whitespace tokenizer backed by an insertion-ordered vocabulary.

    Ids are positions in ``self.vocabulary``: the unknown token is added
    first (id 0), the padding token second (id 1), then the base vocabulary
    in order of first occurrence. Duplicates are ignored.
    """

    def __init__(self, base_vocabulary, unk='<unk>', pad='<pad>'):
        """Create the tokenizer.

        Args:
            base_vocabulary: list of tokens (duplicates allowed).
            unk: symbol used for out-of-vocabulary tokens.
            pad: symbol used to pad sequences.

        Raises:
            TypeError: if ``base_vocabulary`` is not a list.
        """
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not isinstance(base_vocabulary, list):
            raise TypeError(
                f"base_vocabulary must be a list, got {type(base_vocabulary).__name__}"
            )
        self.unk = unk
        self.pad = pad
        self.vocabulary = []
        self.types2idx = {}
        self.add_tokens([self.unk, self.pad] + base_vocabulary)

    def add_tokens(self, tokens):
        """Append unseen tokens to the vocabulary and refresh the id lookup.

        Args:
            tokens: a list of tokens, or a single token (any non-list value
                is treated as one token).
        """
        if not isinstance(tokens, list):
            tokens = [tokens]

        # Set membership keeps this linear; testing ``token in list`` for
        # every token is quadratic in the vocabulary size.
        known = set(self.vocabulary)
        for token in tokens:
            if token not in known:
                known.add(token)
                self.vocabulary.append(token)

        self.types2idx = {el: idx for idx, el in enumerate(self.vocabulary)}

    def tokenize(self, string: str):
        """Split ``string`` on runs of whitespace."""
        return string.split()

    def convert_tokens_to_ids(self, tokens: List[str]):
        """Map tokens to ids; unknown tokens map to the id of ``self.unk``."""
        unkid = self.types2idx[self.unk]
        return [self.types2idx.get(token, unkid) for token in tokens]

    def encode(self, string):
        """Tokenize ``string`` and return the list of token ids."""
        return self.convert_tokens_to_ids(self.tokenize(string))

    def decode(self, ids):
        """Turn an iterable of ids back into a space-joined string."""
        return " ".join(self.vocabulary[idx] for idx in ids)

    def __call__(self, string):
        """Shorthand for :meth:`encode`."""
        return self.encode(string)

    @property
    def pad_id(self):
        """Id of the padding token."""
        return self.types2idx[self.pad]

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary (including unk and pad)."""
        return len(self.vocabulary)

    def decode_ngram(self, ngram_sequence):
        """Decode the last id of every n-gram in ``ngram_sequence``."""
        return self.decode(ngram[-1] for ngram in ngram_sequence)

    def pad_batch(self, batch_codes: List[List[int]]):
        """Right-pad every id list to the longest one and stack them.

        Returns:
            A ``torch.long`` tensor with one row per sentence; an empty
            batch yields an empty tensor instead of raising.
        """
        max_len = max((len(sentence) for sentence in batch_codes), default=0)
        padded_codes = [
            sentence + [self.pad_id] * (max_len - len(sentence))
            for sentence in batch_codes
        ]
        return torch.tensor(padded_codes, dtype=torch.long)

In [2]:
# Round-trip a sentence that contains two out-of-vocabulary words.
base_vocabulary = "Language models are cool ."
tokenizer = NaiveTokenizer(base_vocabulary.split())
tokens = tokenizer.tokenize("Language models are not so cool .")
# Same two steps that ``tokenizer(...)`` runs internally.
codes = tokenizer.convert_tokens_to_ids(
    tokenizer.tokenize("Language models are not so cool .")
)
decoded = tokenizer.decode(codes)
print("codes:", codes)
print("decoded:", decoded)
print("tokens:", tokens)

codes: [2, 3, 4, 0, 0, 5, 6]
decoded: Language models are <unk> <unk> cool .
tokens: ['Language', 'models', 'are', 'not', 'so', 'cool', '.']


In [3]:
# Encode two sentences of different length and pad them into one tensor.
sentences = [
    "Language models are cool .",
    "Language models are not so cool .",
]
batch = list(map(tokenizer, sentences))
print("Pad batch:")
print(tokenizer.pad_batch(batch))

Pad batch:
tensor([[2, 3, 4, 5, 6, 1, 1],
        [2, 3, 4, 0, 0, 5, 6]])


## Dataloader

In [4]:
from torch.utils.data import Dataset, DataLoader

class NgramsLanguageModelDataSet(Dataset):
    """Dataset of fixed-size n-grams of token ids.

    Every sentence is encoded with ``tokenizer`` and left-padded with
    ``N - 1`` pad ids, so each token appears once as the last element of an
    n-gram. ``self.data`` is the flat list of all n-grams.
    """

    def __init__(self, N: int, data: List[str], tokenizer):
        """Build the n-gram list.

        Args:
            N: n-gram size (context of ``N - 1`` ids plus one target id).
            data: raw sentences; empty sentences contribute no n-grams.
            tokenizer: callable mapping a sentence to a list of ids and
                exposing a ``pad_id`` attribute.

        Raises:
            ValueError: if ``N`` is smaller than 1.
        """
        if N < 1:
            # N == 0 would otherwise silently produce a dataset of empty lists.
            raise ValueError(f"N must be >= 1, got {N}")
        self.N = N
        self.tokenizer = tokenizer
        # ngramify always returns windows of exactly N ids, so no length
        # filtering is needed when flattening.
        self.data = [
            ngram
            for sentence in data
            for ngram in self.ngramify(tokenizer(sentence))
        ]

    def ngramify(self, token_list: List[int]):
        """Return one left-padded window of ``N`` ids per id in ``token_list``."""
        padded_tokens = [self.tokenizer.pad_id] * (self.N - 1) + token_list
        return [padded_tokens[i:i + self.N] for i in range(len(token_list))]

    def __len__(self):
        """Get the length of the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Get the corresponding example (or slice of examples) from an index."""
        return self.data[idx]

In [5]:
def normalize(sentence: str):
    """Surround every period with spaces so it splits off as its own token."""
    pieces = sentence.split(".")
    return " . ".join(pieces)

In [6]:
# Toy training corpus, one sentence per line. The literal starts and ends
# with a newline, so splitting it on '\n' yields an empty first and last item.
zebra_dataset = """
There are five houses.
The Englishman lives in the red house.
The Spaniard owns the dog.
Coffee is drunk in the green house.
The Ukrainian drinks tea.
The green house is immediately to the right of the ivory house.
The Old Gold smoker owns snails.
Kools are smoked in the yellow house.
Milk is drunk in the middle house.
The Norwegian lives in the first house.
The man who smokes Chesterfields lives in the house next to the man with the fox.
Kools are smoked in the house next to the house where the horse is kept.
The Lucky Strike smoker drinks orange juice.
The Japanese smokes Parliaments.
The Norwegian lives next to the blue house.
I have lost my keys, yet now I found them.
"""

In [7]:
# Vocabulary: every whitespace-separated token of the normalized corpus
# (duplicates are removed later by the tokenizer).
base_vocabulary = normalize(zebra_dataset).split()
# One normalized sentence per non-blank line; the blank first and last lines
# of the triple-quoted literal would only add empty sentences.
data = [normalize(sent) for sent in zebra_dataset.split('\n') if sent.strip()]
print("Vocabulary:", base_vocabulary)
# Show the sentences just built, not the stale `sentences` list from the
# earlier padding demo.
print("Dataset:", data)

Vocabulary: ['There', 'are', 'five', 'houses', '.', 'The', 'Englishman', 'lives', 'in', 'the', 'red', 'house', '.', 'The', 'Spaniard', 'owns', 'the', 'dog', '.', 'Coffee', 'is', 'drunk', 'in', 'the', 'green', 'house', '.', 'The', 'Ukrainian', 'drinks', 'tea', '.', 'The', 'green', 'house', 'is', 'immediately', 'to', 'the', 'right', 'of', 'the', 'ivory', 'house', '.', 'The', 'Old', 'Gold', 'smoker', 'owns', 'snails', '.', 'Kools', 'are', 'smoked', 'in', 'the', 'yellow', 'house', '.', 'Milk', 'is', 'drunk', 'in', 'the', 'middle', 'house', '.', 'The', 'Norwegian', 'lives', 'in', 'the', 'first', 'house', '.', 'The', 'man', 'who', 'smokes', 'Chesterfields', 'lives', 'in', 'the', 'house', 'next', 'to', 'the', 'man', 'with', 'the', 'fox', '.', 'Kools', 'are', 'smoked', 'in', 'the', 'house', 'next', 'to', 'the', 'house', 'where', 'the', 'horse', 'is', 'kept', '.', 'The', 'Lucky', 'Strike', 'smoker', 'drinks', 'orange', 'juice', '.', 'The', 'Japanese', 'smokes', 'Parliaments', '.', 'The', 'Norwe

In [8]:
# Rebuild the tokenizer on the corpus vocabulary and cut the corpus into 5-grams.
tokenizer = NaiveTokenizer(base_vocabulary)
dataset = NgramsLanguageModelDataSet(N=5, data=data, tokenizer=tokenizer)

# Show the first five n-grams (left-padded with the pad id).
first_ngrams = dataset[:5]
for ngram in first_ngrams:
    print(ngram)

[1, 1, 1, 1, 2]
[1, 1, 1, 2, 3]
[1, 1, 2, 3, 4]
[1, 2, 3, 4, 5]
[2, 3, 4, 5, 6]


In [9]:
# Shuffled mini-batches of ten n-grams, collated into a LongTensor.
dataloader = DataLoader(
    dataset,
    collate_fn=tokenizer.pad_batch,
    shuffle=True,
    batch_size=10,
)

# Peek at a single batch only.
for batch in dataloader:
    print(batch)
    break

tensor([[10, 11, 12, 13,  6],
        [11, 40, 45, 11, 46],
        [ 7, 38,  9, 10, 11],
        [13, 47, 11, 48, 18],
        [ 1,  1,  1,  1, 33],
        [11, 48, 18, 49,  6],
        [31, 22, 52, 53,  6],
        [ 7, 50, 51, 31, 22],
        [13, 18, 24, 25, 11],
        [ 1,  1,  7, 29, 30]])


## Neural network language model

In [10]:
import torch.nn as nn
from torch.nn.functional import tanh, log_softmax

class NNLM (nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context ids are concatenated, passed through one
    tanh hidden layer and projected onto the vocabulary.

    Note: ``__call__`` and ``train`` are overridden. ``__call__`` scores a
    batch of n-grams instead of dispatching to ``forward`` (so module hooks
    are not run), and ``train`` runs the optimisation loop. ``train`` still
    accepts a bool (or no argument) so that ``nn.Module.train(mode)`` and
    ``nn.Module.eval()`` keep working.
    """

    def __init__(self, emb_size, vocab_size, hidden_size, memory_size, pad_id):
        """
        Args:
            emb_size: dimension of the token embeddings.
            vocab_size: number of token ids (size of the output layer).
            hidden_size: width of the hidden layer.
            memory_size: number of context ids per example; ``forward``
                requires inputs with exactly this many columns.
            pad_id: id of the padding token (zero embedding, ignored as a
                training target).
        """
        super().__init__()
        self.pad_id = pad_id
        self.wordemb = nn.Embedding(vocab_size, emb_size, padding_idx=pad_id)
        self.lm_in = nn.Linear(emb_size * memory_size, hidden_size)
        self.lm_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, X):
        """Return unnormalised next-token scores for a batch of contexts.

        Args:
            X: 2-D tensor of context ids, one row per example.

        Returns:
            Tensor with one row of ``vocab_size`` logits per example.
        """
        input_embeddings = self.wordemb(X)
        # Concatenate the context embeddings of each example into one vector.
        input_embeddings = torch.flatten(input_embeddings, start_dim=1)
        hidden_embeddings = self.lm_in(input_embeddings)
        logits = self.lm_out(torch.tanh(hidden_embeddings))
        return logits

    def __call__(self, batch):
        """Score a batch of n-grams.

        The last column of ``batch`` is the target, the rest the context.

        Returns:
            ``(target_logits, logits)``: the log-probability of each target
            (one column) and the full log-probability distribution over the
            vocabulary for each example.
        """
        model_device = next(self.parameters()).device
        batch = batch.to(model_device)

        X = batch[:, :-1]
        Y = batch[:, -1]
        logits = log_softmax(self.forward(X), dim=1)
        target_logits = torch.gather(logits, 1, Y.unsqueeze(1))
        return target_logits, logits

    def train(self, dataloader=True, epochs=None, device="cpu"):  # type: ignore
        """Fit the model, or switch training mode when given a bool.

        Args:
            dataloader: iterable of 2-D id tensors whose last column is the
                target. A bool is forwarded to ``nn.Module.train`` instead.
            epochs: number of passes over ``dataloader``.
            device: device to train on.

        Returns:
            ``None`` after fitting; ``self`` when only switching mode.

        Raises:
            TypeError: if a dataloader is given without ``epochs``.
        """
        # nn.Module.eval() calls self.train(False); keep that path alive.
        if isinstance(dataloader, bool):
            return super().train(dataloader)
        if epochs is None:
            raise TypeError("train() missing required argument: 'epochs'")

        self.to(device)
        cross_entropy = nn.CrossEntropyLoss(ignore_index=self.pad_id)
        optimizer = torch.optim.AdamW(self.parameters(), lr=0.005)

        for epoch in range(1, epochs + 1):
            loss_list = []
            for batch in dataloader:
                X = batch[:, :-1].to(device)
                Y = batch[:, -1].to(device)
                logits = self.forward(X)
                loss = cross_entropy(logits, Y)
                loss.backward()
                loss_list.append(loss.item())
                optimizer.step()
                optimizer.zero_grad()
            # An empty dataloader has no mean loss; report nan, don't crash.
            mean_loss = sum(loss_list) / len(loss_list) if loss_list else float("nan")
            print(f'Epoch {epoch} - loss: {mean_loss}')

In [11]:
# memory_size=4: the 5-gram batches provide four context ids plus one target.
lang_model = NNLM(
    emb_size=128,
    vocab_size=tokenizer.vocab_size,
    hidden_size=128,
    memory_size=4,
    pad_id=tokenizer.pad_id,
)
lang_model.train(dataloader, epochs=500, device="cpu")

Epoch 1 - loss: 3.8070131142934165
Epoch 2 - loss: 1.667949672540029
Epoch 3 - loss: 0.9877670119206111
Epoch 4 - loss: 0.6844001819690069
Epoch 5 - loss: 0.5570072010159492
Epoch 6 - loss: 0.5022843984266122
Epoch 7 - loss: 0.4385283832748731
Epoch 8 - loss: 0.4836795919885238
Epoch 9 - loss: 0.40265653530756634
Epoch 10 - loss: 0.4616381339728832
Epoch 11 - loss: 0.427103060670197
Epoch 12 - loss: 0.5838987104594707
Epoch 13 - loss: 0.4171159699248771
Epoch 14 - loss: 0.4085654493421316
Epoch 15 - loss: 0.4199587131539981
Epoch 16 - loss: 0.3909242886429032
Epoch 17 - loss: 0.4026708455135425
Epoch 18 - loss: 0.37846494363620875
Epoch 19 - loss: 0.42805141483743986
Epoch 20 - loss: 0.3653872096290191
Epoch 21 - loss: 0.3914411984073619
Epoch 22 - loss: 0.45805068298553425
Epoch 23 - loss: 0.42039648660769063
Epoch 24 - loss: 0.4824458745618661
Epoch 25 - loss: 0.3839766502380371
Epoch 26 - loss: 0.4372105735819787
Epoch 27 - loss: 0.39532184610143306
Epoch 28 - loss: 0.36386054996401

In [12]:
# One held-in sentence, cut into 5-grams and served as one unshuffled batch.
test_example = ["The Lucky Strike smoker drinks orange juice ."]
test_set = NgramsLanguageModelDataSet(N=5, data=test_example, tokenizer=tokenizer)
test_loader = DataLoader(
    test_set,
    batch_size=len(test_set),
    collate_fn=tokenizer.pad_batch,
    shuffle=False,
)

for batch in test_loader:
    print(batch)

tensor([[ 1,  1,  1,  1,  7],
        [ 1,  1,  1,  7, 50],
        [ 1,  1,  7, 50, 51],
        [ 1,  7, 50, 51, 31],
        [ 7, 50, 51, 31, 22],
        [50, 51, 31, 22, 52],
        [51, 31, 22, 52, 53],
        [31, 22, 52, 53,  6]])


In [13]:
# The model returns the log-probability of each n-gram's last token;
# negating it gives the per-token surprisal that is printed.
for batch in test_loader:
    example = tokenizer.decode_ngram(batch)
    surprisals, logits = lang_model(batch)
    print(example)
    print(list(zip(example.split(), surprisals.neg().tolist())))

The Lucky Strike smoker drinks orange juice .
[('The', [0.43325895071029663]), ('Lucky', [2.3813390731811523]), ('Strike', [3.361645576660521e-05]), ('smoker', [2.0861407392658293e-05]), ('drinks', [2.13382354559144e-05]), ('orange', [1.2278481335670222e-05]), ('juice', [1.537788011773955e-05]), ('.', [9.775113539944869e-06])]


## Recurrent neural network language model

In [14]:
import torch.nn as nn
from torch.nn.functional import log_softmax

class myLSTM (nn.Module):
    """LSTM language model that predicts the next token from a context.

    Note: ``__call__`` and ``train`` are overridden. ``__call__`` scores a
    batch of n-grams instead of dispatching to ``forward`` (so module hooks
    are not run), and ``train`` runs the optimisation loop. ``train`` still
    accepts a bool (or no argument) so that ``nn.Module.train(mode)`` and
    ``nn.Module.eval()`` keep working.
    """

    def __init__(self, emb_size, vocab_size, hidden_size, pad_id):
        """
        Args:
            emb_size: dimension of the token embeddings.
            vocab_size: number of token ids (size of the output layer).
            hidden_size: size of the LSTM hidden state.
            pad_id: id of the padding token (zero embedding, ignored as a
                training target).
        """
        super().__init__()
        self.pad_id = pad_id
        self.wordemb = nn.Embedding(vocab_size, emb_size, padding_idx=pad_id)
        self.lstm = nn.LSTM(emb_size, hidden_size, batch_first=True)
        self.lm_out = nn.Linear(hidden_size, vocab_size)

    def forward(self, X):
        """Return unnormalised next-token scores for a batch of contexts.

        Args:
            X: 2-D tensor of context ids, one row per example.

        Returns:
            Tensor with one row of ``vocab_size`` logits per example,
            computed from the LSTM output at the last position.
        """
        input_embeddings = self.wordemb(X)
        hidden_embeddings, _ = self.lstm(input_embeddings)
        # Only the output at the final time step is used for prediction.
        latest_hidden = hidden_embeddings[:, -1, :]
        logits = self.lm_out(latest_hidden)
        return logits

    def __call__(self, batch):
        """Score a batch of n-grams.

        The last column of ``batch`` is the target, the rest the context.

        Returns:
            ``(target_logits, logits)``: the log-probability of each target
            (one column) and the full log-probability distribution over the
            vocabulary for each example.
        """
        model_device = next(self.parameters()).device
        batch = batch.to(model_device)
        X = batch[:, :-1]
        Y = batch[:, -1]
        logits = log_softmax(self.forward(X), dim=1)
        target_logits = torch.gather(logits, 1, Y.unsqueeze(1))
        return target_logits, logits

    def train(self, dataloader=True, epochs=None, device="cpu"):  # type: ignore
        """Fit the model, or switch training mode when given a bool.

        Args:
            dataloader: iterable of 2-D id tensors whose last column is the
                target. A bool is forwarded to ``nn.Module.train`` instead.
            epochs: number of passes over ``dataloader``.
            device: device to train on.

        Returns:
            ``None`` after fitting; ``self`` when only switching mode.

        Raises:
            TypeError: if a dataloader is given without ``epochs``.
        """
        # nn.Module.eval() calls self.train(False); keep that path alive.
        if isinstance(dataloader, bool):
            return super().train(dataloader)
        if epochs is None:
            raise TypeError("train() missing required argument: 'epochs'")

        self.to(device)
        cross_entropy = nn.CrossEntropyLoss(ignore_index=self.pad_id)
        optimizer = torch.optim.AdamW(self.parameters(), lr=0.005)

        for epoch in range(1, epochs + 1):
            loss_list = []
            for batch in dataloader:
                X = batch[:, :-1].to(device)
                Y = batch[:, -1].to(device)
                logits = self.forward(X)
                loss = cross_entropy(logits, Y)
                loss.backward()
                loss_list.append(loss.item())
                optimizer.step()
                optimizer.zero_grad()
            # An empty dataloader has no mean loss; report nan, don't crash.
            mean_loss = sum(loss_list) / len(loss_list) if loss_list else float("nan")
            print(f'Epoch {epoch} - loss: {mean_loss}')

In [15]:
# Same embedding and hidden sizes as the feed-forward model above.
lang_model = myLSTM(
    emb_size=128,
    vocab_size=tokenizer.vocab_size,
    hidden_size=128,
    pad_id=tokenizer.pad_id,
)
lang_model.train(dataloader, epochs=500, device="cpu")

Epoch 1 - loss: 3.8736895084381104
Epoch 2 - loss: 2.5143013477325438
Epoch 3 - loss: 1.6256032148996988
Epoch 4 - loss: 0.9559770107269288
Epoch 5 - loss: 0.7887051602204641
Epoch 6 - loss: 0.6718011431396007
Epoch 7 - loss: 0.7019922492404779
Epoch 8 - loss: 0.6077285120884578
Epoch 9 - loss: 0.539241760969162
Epoch 10 - loss: 0.4451027954618136
Epoch 11 - loss: 0.4117100667208433
Epoch 12 - loss: 0.3994343546529611
Epoch 13 - loss: 0.45845330599695444
Epoch 14 - loss: 0.38669197261333466
Epoch 15 - loss: 0.45587313249707223
Epoch 16 - loss: 0.39148643041650455
Epoch 17 - loss: 0.370709247328341
Epoch 18 - loss: 0.35859244167804716
Epoch 19 - loss: 0.44963212410608927
Epoch 20 - loss: 0.46191677736739317
Epoch 21 - loss: 0.383481527864933
Epoch 22 - loss: 0.3758825108408928
Epoch 23 - loss: 0.4246516951049368
Epoch 24 - loss: 0.40299644231175386
Epoch 25 - loss: 0.3588290801892678
Epoch 26 - loss: 0.37271621599793436
Epoch 27 - loss: 0.3552501761664947
Epoch 28 - loss: 0.347925040017

In [16]:
# The model returns the log-probability of each n-gram's last token;
# negating it gives the per-token surprisal that is printed.
for batch in test_loader:
    example = tokenizer.decode_ngram(batch)
    surprisals, logits = lang_model(batch)
    print(example)
    print(list(zip(example.split(), surprisals.neg().tolist())))

The Lucky Strike smoker drinks orange juice .
[('The', [0.5071532726287842]), ('Lucky', [2.4998905658721924]), ('Strike', [1.764281842042692e-05]), ('smoker', [1.2397689715726301e-05]), ('drinks', [1.3828182090946939e-05]), ('orange', [9.536697689327411e-06]), ('juice', [8.583032467868179e-06]), ('.', [2.264974000354414e-06])]


## Universal dependencies datasets

In [17]:
import conllu

def load_conllu(filename):
    """Lazily read a CoNLL-U file, one sentence at a time.

    Args:
        filename: path of a UTF-8 encoded ``.conllu`` file.

    Yields:
        ``(ids, deps, pos, tokens)``: four parallel lists holding, for each
        token of a sentence, its ``id``, ``head``, ``upos`` and ``form``
        fields.
    """
    # The context manager closes the file once the generator is exhausted or
    # discarded; a bare open() call left the handle open.
    # NOTE(review): the conllu library appears to return tuple ids (and a
    # None head) for multiword-token and empty-node rows. Confirm that
    # downstream consumers handle them or filter them here.
    with open(filename, "r", encoding="utf-8") as conllu_file:
        for sent in conllu.parse_incr(conllu_file):
            ids = [token["id"] for token in sent]
            deps = [token["head"] for token in sent]
            pos = [token["upos"] for token in sent]
            tokens = [token["form"] for token in sent]
            yield ids, deps, pos, tokens

In [18]:
import os
import glob

def load_from_file(data_dir="data"):
    """Load every ``*.conllu`` file in ``data_dir``.

    Returns a dict mapping each file's base name (without extension) to the
    list of sentences produced by ``load_conllu``; prints a line per file.
    """
    pattern = os.path.join(data_dir, "*.conllu")
    datasets = {}
    for path in glob.glob(pattern):
        filename = os.path.basename(path)
        lang, _extension = os.path.splitext(filename)
        datasets[lang] = [*load_conllu(path)]
        print(f"Loaded {lang}")
    return datasets

# Load every *.conllu file under ./data, keyed by file name without extension.
language_datasets = load_from_file("data")

Loaded latin
Loaded greek
Loaded turkish
Loaded hungarian
Loaded polish
Loaded arabic
Loaded mandarin
Loaded english
Loaded german
Loaded french


In [19]:
# First English sentence as an (ids, heads, upos tags, word forms) tuple.
print(language_datasets["english"][0])

([1, 2, 3, 4, 5, 6], [2, 0, 5, 5, 2, 2], ['ADJ', 'NOUN', 'CCONJ', 'ADJ', 'NOUN', 'PUNCT'], ['Aesthetic', 'Appreciation', 'and', 'Spanish', 'Art', ':'])


## Random and projective shuffles

In [20]:
from random import shuffle
from typing import List

class DependencyTree:
    """Sentence with dependency edges that can be reordered in place.

    Attributes are plain lists: ``tokens`` holds ``(form, upos)`` pairs with
    an artificial ``("ROOT", "ROOT")`` at position 0, and ``edges`` holds
    ``(governor, dependent)`` pairs of positions in ``tokens``.
    """

    def __init__(self, tokens=None, edges=None):
        """Create a tree; defaults to a lone ROOT token and no edges."""
        self.edges = [] if edges is None else edges
        self.tokens = [("ROOT", "ROOT")] if tokens is None else tokens

    @staticmethod
    def read(data):
        """Build one tree per ``(ids, deps, upos, tokens)`` item of ``data``."""
        trees = []
        for ids, deps, upos, tokens in data:
            dep_tree = DependencyTree()
            dep_tree.tokens.extend(zip(tokens, upos))  # type: ignore
            # Each token id is paired with its head: (governor, dependent).
            dep_tree.edges.extend(zip(deps, ids))
            trees.append(dep_tree)
        return trees

    def copy(self):
        """Return a new tree with shallow copies of the token and edge lists."""
        return DependencyTree(self.tokens[:], self.edges[:])

    def shuffle_random(self, new_order=None):
        """Reorder the tokens in place and renumber the edges to match.

        Args:
            new_order: optional list in which ``new_order[k]`` is the
                current position of the token that moves to position ``k``.
                It should be a permutation of the current positions. When
                omitted (or empty), a random permutation that keeps ROOT at
                position 0 is drawn.
        """
        n_tokens = len(self.tokens)

        if new_order:
            old_to_new = dict(zip(new_order, range(n_tokens)))
        else:
            permuted = list(range(1, n_tokens))
            shuffle(permuted)
            old_to_new = dict(zip(range(n_tokens), [0] + permuted))

        # Build both results first so a failure leaves the tree untouched.
        new_edges = [(old_to_new[gov], old_to_new[dep]) for gov, dep in self.edges]
        new_tokens = [None] * n_tokens
        for old_position, token in enumerate(self.tokens):
            new_tokens[old_to_new[old_position]] = token  # type: ignore

        self.edges = new_edges
        self.tokens = new_tokens

    def shuffle_projective(self):
        """Shuffle the word order while keeping every subtree contiguous.

        Each governor is shuffled among its direct dependents (ROOT always
        stays first), and the tree is flattened by replacing every node with
        that shuffled group.

        Raises:
            ValueError: if the edges do not place every token exactly once,
                e.g. when some token cannot be reached from ROOT.
        """
        children = {}
        for gov, dep in self.edges:
            children.setdefault(gov, []).append(dep)

        for gov, deps in children.items():
            if gov == 0:
                shuffle(deps)
                deps.insert(0, 0)
            else:
                deps.append(gov)
                shuffle(deps)

        # Iterative depth-first flattening. A governor met for the first time
        # is replaced by its shuffled group (which contains the governor
        # itself); on the second visit it is emitted like a leaf.
        new_order = []
        expanded = set()
        pending = [0]
        while pending:
            node = pending.pop()
            if node in expanded or node not in children:
                new_order.append(node)
                continue
            expanded.add(node)
            pending.extend(reversed(children[node]))

        n_tokens = len(self.tokens)
        if len(new_order) != n_tokens or set(new_order) != set(range(n_tokens)):
            raise ValueError(
                "edges do not form a tree over all tokens rooted at position 0"
            )

        self.shuffle_random(new_order)

In [21]:
# One DependencyTree per English sentence.
trees = DependencyTree.read(language_datasets["english"])

In [37]:
# Show the (form, upos) pairs and the (governor, dependent) edges of one tree.
print(trees[22].tokens, trees[22].edges, sep="\n")

[('ROOT', 'ROOT'), ('Previous', 'ADJ'), ('eye', 'NOUN'), ('tracking', 'NOUN'), ('research', 'NOUN'), ('has', 'AUX'), ('highlighted', 'VERB'), ('the', 'DET'), ('potential', 'NOUN'), ('to', 'PART'), ('transform', 'VERB'), ('the', 'DET'), ('ways', 'NOUN'), ('we', 'PRON'), ('understand', 'VERB'), ('visual', 'ADJ'), ('processing', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('arts', 'NOUN'), ('(', 'PUNCT'), ('see', 'VERB'), ('for', 'ADP'), ('example', 'NOUN'), ('Brieber', 'PROPN'), ('2014', 'NUM'), (';', 'PUNCT'), ('Binderman', 'PROPN'), ('et', 'X'), ('al.', 'X'), (',', 'PUNCT'), ('2005', 'NUM'), (')', 'PUNCT'), ('and', 'CCONJ'), ('at', 'ADP'), ('the', 'DET'), ('same', 'ADJ'), ('time', 'NOUN'), ('offers', 'VERB'), ('a', 'DET'), ('direct', 'ADJ'), ('way', 'NOUN'), ('of', 'SCONJ'), ('studying', 'VERB'), ('several', 'ADJ'), ('important', 'ADJ'), ('factors', 'NOUN'), ('of', 'ADP'), ('a', 'DET'), ('museum', 'NOUN'), ('visit', 'NOUN'), ('(', 'PUNCT'), ('Filippini', 'PROPN'), ('Fantoni', 'PROPN'), ('e

In [23]:
def shuffle_trees(trees: List[DependencyTree]):
    """Return ``(projective, random)`` lists of shuffled copies of ``trees``.

    The input trees are left untouched; all projective shuffles are done
    before any random shuffle.
    """
    def _shuffled_copies(apply_shuffle):
        # Copy first, then shuffle each copy in place, in input order.
        clones = [tree.copy() for tree in trees]
        for clone in clones:
            apply_shuffle(clone)
        return clones

    projective_copies = _shuffled_copies(lambda clone: clone.shuffle_projective())
    random_copies = _shuffled_copies(lambda clone: clone.shuffle_random())
    return projective_copies, random_copies

In [49]:
# Shuffle a single tree; `test` is the (projective, random) pair of lists.
test = shuffle_trees(trees[20:21])

In [46]:
# Tokens of the randomly shuffled copy (index 1 = random list, 0 = first tree).
test[1][0].tokens

[('ROOT', 'ROOT'),
 (';', 'PUNCT'),
 ('and', 'CCONJ'),
 (')', 'PUNCT'),
 ('et', 'X'),
 ('the', 'DET'),
 (',', 'PUNCT'),
 ('Fantoni', 'PROPN'),
 ('arts', 'NOUN'),
 ('tracking', 'NOUN'),
 ('ways', 'NOUN'),
 ('of', 'SCONJ'),
 ('Previous', 'ADJ'),
 (';', 'PUNCT'),
 ('Brieber', 'PROPN'),
 ('at', 'ADP'),
 ('we', 'PRON'),
 ('.', 'PUNCT'),
 ('potential', 'NOUN'),
 ('al.', 'X'),
 ('several', 'ADJ'),
 (',', 'PUNCT'),
 ('(', 'PUNCT'),
 ('visual', 'ADJ'),
 ('the', 'DET'),
 ('(', 'PUNCT'),
 ('Heidenreich', 'PROPN'),
 ('2010', 'NUM'),
 ('of', 'ADP'),
 ('way', 'NOUN'),
 ('understand', 'VERB'),
 ('visit', 'NOUN'),
 ('highlighted', 'VERB'),
 ('has', 'AUX'),
 ('for', 'ADP'),
 ('museum', 'NOUN'),
 ('important', 'ADJ'),
 ('Filippini', 'PROPN'),
 ('direct', 'ADJ'),
 ('eye', 'NOUN'),
 ('research', 'NOUN'),
 ('same', 'ADJ'),
 ('example', 'NOUN'),
 ('factors', 'NOUN'),
 ('a', 'DET'),
 ('transform', 'VERB'),
 ('al.', 'X'),
 ('2011', 'NUM'),
 ('studying', 'VERB'),
 ('2013', 'NUM'),
 ('Binderman', 'PROPN'),
 (')