In [1]:
from collections import Counter
import contextlib
from contextlib import ExitStack
from livelossplot import PlotLosses
from tabulate import tabulate
from IPython.display import display, Markdown
from ilonimi import Vocabulary, Normalizer, Tokenizer, Splitter
import random as rd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, Sampler, DataLoader
from torch.nn.utils.rnn import pad_sequence as pad

In [2]:
# Preprocessing helpers from the `ilonimi` package.
# NOTE(review): the exact semantics of the flags below are defined by ilonimi,
# not by this notebook -- presumably `convert_unk` maps out-of-vocabulary words
# to '<unk>' (the token list built later special-cases '<unk>'); confirm there.
vocab = Vocabulary()
normalizer = Normalizer()
tokenizer = Tokenizer(convert_unk = True, convert_number = False, convert_proper = False)
splitter = Splitter(sharp = False)

# Corpus files read for training; paths are relative to the notebook's
# working directory.
path_list = [
    '../tokipona-corpus-collection/100tokipona/100tokipona.txt',
    '../tokipona-corpus-collection/tokipona1000/tokipona1000.txt',
    '../tokipona-corpus-collection/tatoeba/tatoeba.txt',
]

def preproc(sent):
    """Strip one raw corpus line and run it through the ilonimi pipeline.

    The line is stripped of surrounding whitespace and then passed, in order,
    through the module-level ``normalizer``, ``tokenizer`` and ``splitter``.
    The result of the last stage is returned (later cells call ``.split()``
    on it, i.e. treat it as a whitespace-separated string).
    """
    sent = sent.strip()
    for stage in (normalizer, tokenizer, splitter):
        sent = stage(sent)
    return sent
    
# Read every corpus file line by line and preprocess each line.
# The encoding is given explicitly so that decoding does not depend on the
# machine's locale, and each file is closed as soon as it has been consumed
# (instead of keeping all of them open until the last one is read).
sents = []
for path in path_list:
    with open(path, encoding = 'utf-8') as corpus_file:
        sents.extend(preproc(line) for line in corpus_file)

# Keep only sentences of at most 40 whitespace-separated tokens.
sents = [sent for sent in sents if len(sent.split()) <= 40]

# Deterministic shuffle (fixed seed), then carve the last 4000 sentences into
# validation (first 2000 of them) and test (final 2000); the rest is training.
rd.seed(100)
rd.shuffle(sents)

train_sents, valid_sents, test_sents = sents[:-4000], sents[-4000:-2000], sents[-2000:]

# One size per line: train, valid, test.
print(len(train_sents), len(valid_sents), len(test_sents), sep = '\n')

# Word frequencies over the training split, most frequent first.
freq = Counter(
    word
    for sent in train_sents
    for word in sent.split()
).most_common()

# Special symbols occupy the first four ids; '<unk>' is dropped from the
# frequency-ordered words so that it is not listed twice.
tokens = ['<pad>', '<bos>', '<eos>', '<unk>'] + [w for w, _ in freq if w != '<unk>']
print(len(tokens))

43036
2000
2000
240


In [3]:
class Vocab:
    """Bidirectional mapping between token strings and integer ids.

    ``tokens`` is a list whose positions are the ids; it must contain the
    special symbols '<pad>', '<bos>', '<eos>' and '<unk>' (a missing one
    raises ``KeyError`` in the constructor).

    * ``vocab(token)``   -> id of ``token``, or the '<unk>' id if unknown
    * ``vocab[index]``   -> token string stored at ``index``
    * ``token in vocab`` -> whether ``token`` has an id
    * ``len(vocab)``     -> number of tokens
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.token_dict = {token: index for index, token in enumerate(tokens)}
        self.pad = self.token_dict['<pad>']
        self.bos = self.token_dict['<bos>']
        self.eos = self.token_dict['<eos>']
        self.unk = self.token_dict['<unk>']

    def __contains__(self, x):
        # The original method was spelled `__contain__`, which Python never
        # calls: `x in vocab` silently fell back to scanning the token list
        # through `__getitem__`. With the correct name membership is a dict
        # lookup.
        return x in self.token_dict

    # Backward-compatible alias for the original (misspelled) method name.
    __contain__ = __contains__

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, x):
        return self.tokens[x]

    def __call__(self, x):
        # Unknown tokens map to the '<unk>' id.
        return self.token_dict.get(x, self.unk)

In [4]:
class LMBatch:
    """Container for one language-model mini-batch.

    Attributes:
        decoder_inputs: input token ids (required).
        decoder_outputs: target token ids, or None.
        decoder_lengths: stored as given; never touched by this class.
        decoder_padding_mask: mask over ``decoder_inputs``, or None.
    """

    def __init__(self, di, do=None, dl=None, dpm=None):
        self.decoder_inputs = di
        self.decoder_outputs = do
        self.decoder_lengths = dl
        self.decoder_padding_mask = dpm

    def __len__(self):
        # Size of the last dimension of the input tensor.
        return self.decoder_inputs.size(-1)

    def cuda(self):
        """Move the tensors to the GPU in place and return ``self``.

        ``decoder_lengths`` is left where it is.
        """
        self.decoder_inputs = self.decoder_inputs.cuda()
        for field in ('decoder_outputs', 'decoder_padding_mask'):
            value = getattr(self, field)
            if value is not None:
                setattr(self, field, value.cuda())
        return self

In [5]:
class LMDataset(Dataset):
    """Dataset over already-encoded sentences (lists of token ids).

    ``lengths[i]`` is ``len(sents[i]) + 1``, i.e. the sentence length once a
    single boundary symbol has been added; the sampler uses it to budget
    tokens per batch.
    """

    def __init__(self, sents, vocab):
        self.sents = sents
        self.vocab = vocab
        self.lengths = torch.tensor([1 + len(encoded) for encoded in sents])

    def __len__(self):
        return len(self.sents)

    def __getitem__(self, index):
        # Items are returned as stored; tensor conversion happens in the collator.
        return self.sents[index]
    

class LMCollator:
    """Collate a list of encoded sentences into an ``LMBatch``.

    Inputs are the sentences prefixed with ``<bos>``; targets are the same
    sentences suffixed with ``<eos>``. Both are padded with the ``<pad>`` id
    via ``pad_sequence`` with its default layout, so the resulting tensors are
    (max_len, batch). The padding mask is True where the input is ``<pad>``
    and has the same (max_len, batch) layout.
    """

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, batch):
        pad_id = self.vocab.pad

        def padded(rows):
            # Stack variable-length id lists into one padded tensor.
            return pad([torch.tensor(row) for row in rows], padding_value = pad_id)

        inputs = padded([[self.vocab.bos] + sent for sent in batch])
        targets = padded([sent + [self.vocab.eos] for sent in batch])
        return LMBatch(inputs, do = targets, dpm = (inputs == pad_id))

In [6]:
class LMSampler(Sampler):
    """Batch sampler that groups samples of similar length under a token budget.

    Each iteration shuffles the dataset, sorts it by descending length (the
    prior shuffle randomises the order among equal lengths), cuts the sorted
    sequence into batches such that ``batch_size * longest_sample`` does not
    exceed ``max_tokens``, and finally shuffles the order of the batches.

    Args:
        dataset: object providing ``__len__`` and a 1-D tensor ``lengths``.
        max_tokens: upper bound on the padded size of a batch. A single
            sample longer than ``max_tokens`` is still emitted, alone in its
            own batch.

    Yields:
        Lists of integer dataset indices, one list per batch.
    """

    def __init__(self, dataset, max_tokens):
        self.dataset = dataset
        self.max_tokens = max_tokens

    def __iter__(self):
        self.indices = torch.randperm(len(self.dataset))
        self.indices = self.indices[self.dataset.lengths[self.indices].argsort(descending=True)]
        for batch in self.generate_batches():
            yield batch

    def generate_batches(self):
        """Split ``self.indices`` (set by ``__iter__``) into token-bounded batches."""
        lengths = self.dataset.lengths
        batches = []
        batch = []
        max_len = 0
        # Plain ints instead of 0-d tensors: cheaper and safe to use as indices.
        for index in self.indices.tolist():
            this_len = int(lengths[index])
            new_max = max(max_len, this_len)
            # Only flush a batch that actually holds something; the original
            # appended an empty batch when the very first sample already
            # exceeded the budget, which breaks collation downstream.
            if batch and (len(batch) + 1) * new_max > self.max_tokens:
                batches.append(batch)
                batch = [index]
                max_len = this_len
            else:
                batch.append(index)
                max_len = new_max
        if batch:
            batches.append(batch)
        rd.shuffle(batches)
        return batches

In [7]:
class SinusoidalPositionalEmbedding(nn.Module):
    """Fixed sinusoidal position table looked up by position id.

    Row ``p`` of the table holds ``sin(p * w_k)`` in the even columns and
    ``cos(p * w_k)`` in the odd columns, with ``w_k = denom ** (-2k / d_model)``.
    The table is stored as the weight of an ``nn.Embedding`` with gradients
    disabled, so it is still a registered (frozen) parameter of the module.

    Args:
        d_model: embedding width; odd widths are supported.
        max_len: number of positions in the table.
        denom: base of the geometric frequency progression.
    """

    def __init__(self, d_model, max_len = 128, denom = 10000.0):
        super().__init__()
        self.embedding = nn.Embedding(max_len, d_model).requires_grad_(False)
        pos = torch.arange(0.0, max_len).unsqueeze(-1)
        div = torch.exp(-torch.arange(0, d_model, 2.0) / d_model * torch.log(torch.tensor(denom)))
        angles = div * pos  # (max_len, ceil(d_model / 2))
        with torch.no_grad():
            self.embedding.weight[:, 0::2] = torch.sin(angles)
            # For odd d_model there is one fewer odd column than even columns;
            # drop the surplus cosine column instead of failing on a shape mismatch.
            self.embedding.weight[:, 1::2] = torch.cos(angles)[:, :d_model // 2]

    def forward(self, x):
        """Return the table rows for the integer position ids in ``x``."""
        return self.embedding(x)


class TransformerEmbedding(nn.Module):
    """Token embedding plus sinusoidal position embedding, LayerNorm and dropout.

    Args:
        d_vocab: vocabulary size.
        d_model: embedding width.
        dropout: dropout probability applied to the normalised sum.
        padding_idx: id of the padding token in the token embedding table.
        max_seq_len: number of positions available in the position table.
    """

    def __init__(self, d_vocab, d_model, dropout, padding_idx = 0, max_seq_len = 128):
        super().__init__()
        # Honour the caller-supplied padding index; the original hard-coded 0
        # here and silently ignored the argument.
        self.token_embedding = nn.Embedding(d_vocab, d_model, padding_idx = padding_idx)
        self.position_embedding = SinusoidalPositionalEmbedding(d_model, max_seq_len, 10000.0)
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, position_ids=None):
        """Embed token ids ``x``.

        When ``position_ids`` is omitted, positions ``0 .. x.size(0) - 1`` are
        used with a trailing singleton dimension, i.e. the first dimension of
        ``x`` is treated as the sequence axis and the positions broadcast over
        the remaining one.
        """
        x = self.token_embedding(x)

        if position_ids is None:
            position_ids = torch.arange(x.size(0), device=x.device).unsqueeze(-1)
        x = x + self.position_embedding(position_ids)

        x = self.norm(x)
        x = self.dropout(x)
        return x


class SelfAttentionSubLayer(nn.Module):
    """Pre-norm self-attention with a residual connection.

    Computes ``x + dropout(attention(norm(x)))`` where query, key and value
    are all the normalised input.
    """

    def __init__(self, d_model, nhead, dropout, attention_dropout):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout = attention_dropout)
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask, padding_mask):
        normed = self.norm(x)
        # MultiheadAttention returns (output, attention_weights); only the
        # output is used.
        attended = self.self_attn(
            normed, normed, normed,
            attn_mask = mask,
            key_padding_mask = padding_mask)[0]
        return x + self.dropout(attended)

    
class FeedForwardSubLayer(nn.Module):
    """Pre-norm position-wise feed-forward block with a residual connection.

    Computes ``x + dropout(linear2(dropout(relu(linear1(norm(x))))))``; the
    same dropout module is applied at both places.
    """

    def __init__(self, d_model, dim_feedforward, dropout):
        super().__init__()
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x):
        hidden = self.activation(self.linear1(self.norm(x)))
        projected = self.linear2(self.dropout(hidden))
        return x + self.dropout(projected)


class TransformerLMLayer(nn.Module):
    """One transformer block: self-attention sub-layer, then feed-forward sub-layer.

    ``dropout`` is used by the attention sub-layer's residual dropout,
    ``attention_dropout`` inside the attention itself, and
    ``activation_dropout`` is the dropout rate handed to the feed-forward
    sub-layer.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation_dropout, attention_dropout):
        super().__init__()
        self.self_attn_layer = SelfAttentionSubLayer(d_model, nhead, dropout, attention_dropout)
        self.feed_forward_layer = FeedForwardSubLayer(d_model, dim_feedforward, activation_dropout)

    def forward(self, x, attn_mask = None, padding_mask = None):
        attended = self.self_attn_layer(x, attn_mask, padding_mask)
        return self.feed_forward_layer(attended)


class TransformerLMDecoder(nn.Module):
    """Stack of ``TransformerLMLayer`` blocks followed by a final LayerNorm."""

    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation_dropout, attention_dropout, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerLMLayer(d_model, nhead, dim_feedforward, dropout, activation_dropout, attention_dropout)
            for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, attn_mask = None, padding_mask = None):
        """Run ``x`` through every layer.

        ``attn_mask`` and ``padding_mask`` are forwarded unchanged to each
        layer (and from there to ``nn.MultiheadAttention`` as ``attn_mask``
        and ``key_padding_mask``). Both default to None, which reproduces the
        previous unmasked behaviour of this class.
        """
        for layer in self.layers:
            x = layer(x, attn_mask = attn_mask, padding_mask = padding_mask)
        x = self.norm(x)
        return x


class TransformerLM(nn.Module):
    """Autoregressive transformer language model: embedding, decoder stack, vocabulary projection."""

    def __init__(self, d_vocab, d_model, nhead, dim_feedforward, dropout, activation_dropout, attention_dropout, num_layers):
        super().__init__()
        self.embedding = TransformerEmbedding(d_vocab, d_model, dropout)
        self.decoder = TransformerLMDecoder(d_model, nhead, dim_feedforward, dropout, activation_dropout, attention_dropout, num_layers)
        self.proj = nn.Linear(d_model, d_vocab)

    def forward(self, x, attn_mask = None, padding_mask = None, causal = True):
        """Return next-token logits for token ids ``x``.

        Args:
            x: token ids with the sequence on the first axis (the embedding
                derives positions from ``x.size(0)``).
            attn_mask: optional attention mask passed to every layer. When it
                is None and ``causal`` is true, an upper-triangular boolean
                mask is built so that position ``i`` cannot attend to
                positions ``> i``. Without it the model sees the very tokens
                it is asked to predict.
            padding_mask: optional key padding mask passed to every layer.
                NOTE(review): ``nn.MultiheadAttention`` expects this as
                (batch, seq); a mask laid out like ``x`` (seq, batch) must be
                transposed by the caller -- confirm against the collator.
            causal: set to False to disable the automatic causal mask.
        """
        if attn_mask is None and causal:
            seq_len = x.size(0)
            # True marks pairs that must NOT attend (strictly future positions).
            attn_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype = torch.bool, device = x.device),
                diagonal = 1)
        x = self.embedding(x)
        x = self.decoder(x, attn_mask = attn_mask, padding_mask = padding_mask)
        x = self.proj(x)
        return x

In [8]:
def sents_to_data(vocab, sents):
    """Encode sentences as lists of token ids.

    Each sentence is split on whitespace and every token is mapped through
    ``vocab(token)``. Returns one list of ids per input sentence, in order.
    """
    return [
        [vocab(token) for token in sent.split()]
        for sent in sents]

# Build the id mapping from the training-derived token list, then encode the
# three splits with it. Note that this rebinds the name `vocab`.
vocab = Vocab(tokens)
train_data, valid_data, test_data = (
    sents_to_data(vocab, split_sents)
    for split_sents in (train_sents, valid_sents, test_sents))

In [9]:
# Token budget per batch (batch size x longest sequence in the batch).
max_tokens = 4000
collator = LMCollator(vocab)
train_dataset = LMDataset(train_data, vocab)
valid_dataset = LMDataset(valid_data, vocab)
train_sampler = LMSampler(train_dataset, max_tokens)
valid_sampler = LMSampler(valid_dataset, max_tokens)
# Reuse the samplers created above; the original built a second, identical
# pair inline and left these two unused.
train_loader = DataLoader(train_dataset, batch_sampler = train_sampler, collate_fn = collator)
valid_loader = DataLoader(valid_dataset, batch_sampler = valid_sampler, collate_fn = collator)
# Hyper-parameters passed by keyword so each number is self-describing.
model = TransformerLM(
    len(vocab),
    d_model = 256,
    nhead = 8,
    dim_feedforward = 512,
    dropout = 0.3,
    activation_dropout = 0.2,
    attention_dropout = 0.2,
    num_layers = 6)
model.cuda()
# Total number of parameters (displayed as the cell result).
sum(p.numel() for p in model.parameters())

3319536

In [23]:
# Raise NumPy's summarisation threshold so printed arrays are shown in full
# instead of being elided with "...".
np.set_printoptions(threshold = 1_000_000)


In [24]:
# Debugging pass over the training loader: no forward/backward step yet, each
# batch is only moved to the GPU and its input ids are printed.
clip_norm = 0.1
num_steps = 0
for epoch in range(1):
    model.train()
    for batch in train_loader:
        batch = batch.cuda()  # cuda() moves in place and returns the same object
        input_ids = batch.decoder_inputs.detach().cpu().numpy()
        print(input_ids)
        # TODO: run the model on the batch, e.g.
        #with torch.no_grad():
        #    pred = model()

[[  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1 

[[  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1 

[[  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1 

[[  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1]
 [  8  18  23  18   7  12   7  18   8   7   8  18  18   8  30  12   8  12
    8   8   8  90  18   8  31  18   7   8  51   7  12  18  18   8   8  21
    8  99  41  18   7   8   8  33   7  12   8   8  12   7   8   8  12

[[  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
    1   1   1   1   1   1   1   1   1 

In [None]:
# Preview a few encoded training sentences; displaying the whole list would
# write every training sentence into the notebook output.
train_data[:5]