In [52]:
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F

torch.set_printoptions(sci_mode=False)
device = 'cuda:0'

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace, Sequence, ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(special_tokens=["[UNK]", "[START]", "[END]", "[PAD]"], vocab_size=16384)

files = ["corpus150.txt"]
tokenizer.train(files, trainer)

#tokenizer.encode('[START] A duck is a carnivorous animal')

In [4]:
import os
import random
import codecs
from timeit import timeit

class Data:
    def __init__(self):
        filename = 'corpus150.txt'
        
        self.file = codecs.open(filename, 'r', encoding='utf-8', errors='ignore')
        self.file_length = os.stat(filename).st_size
        
        print('Loaded dataset file of size', self.file_length)
        
    def sample_batch(self, n=32, length=240):
        # sample a lot of strings of certain length
        strs = []
        for i in range(n):
            self.file.seek(random.randrange(0, self.file_length - length))
            strs.append(self.file.read(length))
        
        # encode with tokenizer
        x = [encoding.ids for encoding in tokenizer.encode_batch(strs)]
        
        # shorten the long ones
        min_len = min(map(len, x))
        x = [ids[0:min_len] for ids in x]
        
        # put it into pytorch preferred format (torch.tensor, with shape (sequence, batch))
        x = torch.tensor(x)
        x = x.transpose(1, 0)
        
        return x
        
dataset = Data()

#timeit(dataset.sample_batch, number=100) / 100

Loaded dataset file of size 1537774


In [5]:
#(torch.rand((2,2)) > 0.8).float() * torch.ones()

In [5]:
def checker_board(d_model):
    half = (d_model) // 2
    texture = torch.cat([
        torch.ones((half, 1)),
        torch.zeros((half, 1))
    ], dim=1).view((-1,))
    
    return texture

print(checker_board(8))
print(-checker_board(8) + 1)

tensor([1., 0., 1., 0., 1., 0., 1., 0.])
tensor([0., 1., 0., 1., 0., 1., 0., 1.])


In [6]:
def pos_embedding(x):
        # x: (pos, n, i)
        
        length = x.shape[0]
        batch_size = x.shape[1]
        d_model = x.shape[2]

        i = torch.arange(0, d_model).view((1, 1, -1)).expand(length, -1, d_model).to(device).float()
        pos = torch.arange(0, length).view((-1, 1, 1)).expand(length, -1, d_model).to(device).float()
        
        z = pos / 10000 ** (i / d_model)
        
        sin = torch.sin(z)
        cos = torch.cos(z)
        
        sin_mask = checker_board(d_model).to(device)
        cos_mask = -sin_mask + 1
                
        pe = (sin_mask * sin) + (cos_mask * cos)
        pe = pe.expand(length, batch_size, d_model)
        
        return x + pe

In [62]:
class Model(nn.Module):
    def __init__(self, dropout=0.1, embedding_dim=512, heads=8, num_layers=6):
        super(Model, self).__init__()
        # config
        self.dropout = dropout
        self.embedding_dim = embedding_dim
        self.heads = heads
        self.num_layers = num_layers
        
        self.start_token = torch.tensor([[1]]).to(device)
        
        # layers
        self.embedding = nn.Embedding(num_embeddings=16384, embedding_dim=embedding_dim)
        
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, heads, dim_feedforward=2048, dropout=dropout)
        decoder_layer = nn.TransformerDecoderLayer(embedding_dim, heads, dim_feedforward=2048, dropout=dropout)
        
        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.decoder = torch.nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        
        self.unembedding = nn.Linear(512, 16384)
        self.unembedding.weight.data = self.embedding.weight.data
    
    def forward(self, x):
        x = self.embedding(x)
        
        source = pos_embedding(x)
        target = pos_embedding(torch.cat([
            self.embedding(self.start_token).expand(1, source.shape[1], -1),
            x
        ], dim=0)[:-1])
        
        source_mask = (torch.rand((source.shape[0], source.shape[1], 1)) > 0.1).float().expand(-1, -1, self.embedding_dim).to(device)
        source = source * source_mask
        
        target_mask = torch.triu(torch.full((target.shape[0], target.shape[0]), float('-inf')), diagonal=1).to(device)
        
        memory = self.encoder(source)
        output = self.decoder(target, memory, tgt_mask=target_mask)
        
        output = self.unembedding(output)
        
        return output
    
    def use(self, source, duration=20):
        source = pos_embedding(self.embedding(source))
        memory = self.encoder(source)
        
        target = [1]
        
        for i in range(duration):
            target_embedded = pos_embedding(self.embedding(torch.tensor(target, dtype=torch.long).view((-1, 1)).to(device)))
            output = self.unembedding(self.decoder(target_embedded, memory))[-1, 0]
            output = torch.distributions.categorical.Categorical(probs=F.softmax(output, dim=0)).sample()
            #print(output_dist)
            target.append(output.item())
        
        return target
        
    def beam(self, source, k=15, length=30):
        # encode
        source = self.embedding(source)
        source = pos_embedding(source)
        memory = self.encoder(source)
        # copy memory for each k we'll be processing
        memory = memory.expand(-1, k, -1)
        
        # keep [k] active branches of [length]
        target = torch.zeros((length, k), dtype=torch.long).to(device)
        scores = torch.zeros((length, k)).to(device)
        # every branches root is a start token
        target[0, :] = 1
        scores[0, :] = 0
        
        # decode
        for i in range(1, length):
            # make predictions for each current branch
            target_embedding = pos_embedding(self.embedding(target[:i]))
            output = self.unembedding(self.decoder(target_embedding, memory))[-1:]
            
            # find [k] best branches from each current branch.
            y = torch.topk(output, k=k, dim=2)
            
            # find the value of current branches as they are
            current_evidence = scores[:i].sum(dim=0).view((1, k)).unsqueeze(2).expand(1, -1, k)
            
            # add that to the value of each possible branch for each current branch
            branch_evidence = current_evidence + y.values
            
            # decide which possibilities should be leaf of the new [k] current branches based on the highest total value
            y_topk = torch.topk(branch_evidence.view((-1)), k=k)
            
            # find out which current root branch the new leafs belong to
            current_k_root = y_topk.indices // k
            
            # replace target & scores with the best leafs, preceded by their current branch
            new_target = torch.cat(
                [
                    target[0:i, current_k_root], # current branch that the leaf comes from
                    y.indices.view((-1,))[y_topk.indices].view((1, k)) # leaf
                ],
                dim=0
            )
            new_scores = torch.cat(
                [
                    scores[0:i, current_k_root],
                    y.values.view((-1,))[y_topk.indices].view((1, k))
                ],
                dim=0
            )
            
            target[:i+1] = new_target
            scores[:i+1] = new_scores
            
        return target[:, scores.sum(dim=0).argmax()]
            
        
        
model = Model().to(device)
#model.load_state_dict(torch.load('state_dicts/bert6'))


In [59]:
def lr(i, warmup_steps, model_params):
    i += 0.001
    base = model_params ** -0.5
    
    warmup = i * (warmup_steps ** -1.5)
    general = i ** -0.5
    
    return base * np.minimum(warmup, general)

optimizer = optim.Adam(model.parameters(), lr=1, weight_decay=0.1)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda i: lr(i, warmup_steps=4000, model_params=512))


In [None]:
from IPython.display import clear_output


#torch.cuda.empty_cache()
for i in range(10000):
    optimizer.zero_grad()
    x = dataset.sample_batch(n=32, length=400).to(device)
    
    y = model.forward(x)
    
    #print(y[1,0])
    
    loss = nn.CrossEntropyLoss()(y.view((-1, 16384)), x.reshape((-1)))
    loss.backward()
    optimizer.step()
    
    print(optimizer.param_groups[0]['lr'])
    print('loss', loss)    
    #print(x.shape, y.shape)
    #print(x[0:5, 0], y[0:5, 0].argmax(dim=1))
    print(tokenizer.decode(x[:, 0].tolist()))
    print('================================')
    print(tokenizer.decode(y[:, 0].argmax(dim=1).tolist()))
    clear_output(wait=True)
    
    scheduler.step()
    #break

0.0006208584240704868
loss tensor(92.8548, device='cuda:0', grad_fn=<NllLossBackward>)
 lub plays in the English Premier League and is owned by a company from Dubai. As of September 1, 2008 they are the richest club in the English Premier League.
The manager of the club is the Welshman Mark Hughes who replaced Sven-Göran Eriksson. They finished 9th place in the 2007-08 Premier League
 exists90igsigs pubimchiigs Premierigsigs Branimchi Jainigs90imchi Jain90igs Advertigsigs domesticated Branimchi90909090 Bran909090 Premierlanca9090 Bran909090imchiigs Bran9090igs90 Bran Bran90lanca Bran Bran9090igslast Bran9090igs90imchi Branigs90 Branigs 20079090 Premier


In [19]:
torch.save(model.state_dict(), 'state_dicts/bert6')

In [48]:
torch.cuda.empty_cache()
tokenizer.decode(
    model.beam(
        torch.tensor(tokenizer.encode('England is a [UNK] country').ids, dtype=torch.long).view((-1, 1)).to(device),
        k=15,
        length=20
    ).tolist()
)

'. of of\n. is.... by., of of\n is is.'