In [1]:
import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset

from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from library_model import layers as lay
from library_model import model_building as mb
from data import data_loading as dt
from data import text as txt
import numpy as np
import random

In [2]:
# Raw text items of the WikiText-2 training split (iterated again by later cells).
train_iter = WikiText2(split="train")
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Helper functions for processing data 

In [3]:
# seq_length: number of tokens in every fixed-length training sequence.
# batch_size: number of sequences handed to the dataloader per batch.
seq_length, batch_size = 35, 20
def get_dataloader(data, seq_length, batch_size):
    """Cut a flat token tensor into fixed-length rows and wrap them in a dataloader.

    Trailing tokens that do not fill a complete row of ``seq_length`` are
    dropped. The resulting 2-D tensor is passed twice (as the first and the
    second argument) to ``dt.get_tranformer_dataloader`` together with
    ``batch_size`` and ``shuffle_batch=True``.

    Args:
        data: flat tensor of encoded tokens.
        seq_length: number of tokens per row.
        batch_size: batch size forwarded to the dataloader factory.

    Returns:
        Whatever ``dt.get_tranformer_dataloader`` returns (a shuffling
        dataloader with a fixed batch size, per the original author's note).
    """
    full_rows = len(data) // seq_length
    usable_tokens = full_rows * seq_length
    # Keep only the tokens that fill whole rows, then fold them into rows.
    sequences = data[:usable_tokens].view(-1, seq_length)
    # NOTE(review): the same tensor serves as both arguments; presumably any
    # input/target shifting happens inside the dataloader helper — confirm.
    return dt.get_tranformer_dataloader(sequences, sequences, batch_size, shuffle_batch=True)

def process_data(train_iter, coders_cls, seq_length, batch_size):
    """Build text coders from the corpus and a dataloader over its encoded tokens.

    Args:
        train_iter: iterable of text items; it is iterated once by
            ``coders_cls`` and once more here, so it must be re-iterable.
        coders_cls: callable that receives ``train_iter`` and returns an
            object exposing ``text_encoding`` (per the original note it also
            holds the vocab, tokenizer, and decoders).
        seq_length: tokens per sequence, forwarded to ``get_dataloader``.
        batch_size: batch size, forwarded to ``get_dataloader``.

    Returns:
        Tuple ``(coders, dataloader)``.
    """
    coders = coders_cls(train_iter)
    # Merge every text item into one long string separated by single spaces.
    corpus = " ".join(train_iter)
    # Encode, flatten to 1-D, and move to the notebook-level ``device``.
    token_ids = coders.text_encoding(corpus)
    flat_ids = token_ids.view(-1).to(device)
    loader = get_dataloader(flat_ids, seq_length, batch_size)
    return coders, loader

Library vs. my own tokenizers and encoders

In [4]:
# Coders and dataloader built with the library-backed text coders.
lib_coders, lib_train_dataloader = process_data(
    train_iter,
    txt.library_text_coders,
    seq_length,
    batch_size,
)
# Same pipeline, but built with the hand-written word tokenizer.
my_coders, my_train_dataloader = process_data(
    train_iter,
    txt.My_word_tokenizer,
    seq_length,
    batch_size,
)

Some more helpful definitions

In [5]:
# Short aliases: "l_" = library coders, "m_" = my (hand-written) coders.
l_encode, l_decode = lib_coders.text_encoding, lib_coders.text_decoding
m_encode, m_decode = my_coders.text_encoding, my_coders.text_decoding

Defining hyperparameters

In [6]:
# --- Vocabulary (taken from the hand-written tokenizer) ---
vocab = my_coders.tokens
ntokens = len(vocab)  # vocabulary size; used for both input and output vocab sizes

# Encoded "." token; handed to the model builder as its `end` argument.
ending_wrd = my_coders.token_encoding["."]

# --- Model dimensions ---
emsize = 200  # embedding dimension
d_hid = 200   # dimension of the feedforward network inside each layer
nlayers = 2   # number of stacked layers
nhead = 2     # number of attention heads

In [7]:
#Choosing most frequent word as starting vector for autoregression
from collections import Counter
 
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    Ties are resolved by ``Counter.most_common``. An empty input raises
    ``IndexError`` because there is no top entry to index.
    """
    top_entries = Counter(List).most_common(1)
    winner, _count = top_entries[0]
    return winner

# Encoding of the most frequent whitespace-separated word in the joined corpus.
starting_wrd = my_coders.token_encoding[
    most_frequent(" ".join(train_iter).split())
]
# Workaround: the start encoding is duplicated along dim 1 because of a known
# bug in the custom attention code (per the original author's note).
starting_wrd = torch.cat((starting_wrd, starting_wrd), dim=1)

Instantiating the model

In [8]:
# Build the Transformer; the helper returns the model together with `opt`
# (presumably its optimizer, given that it is what a scheduler would wrap).
model, opt = mb.get_registered_Transformer_model(
    in_vocab_size=ntokens,
    out_vocab_size=ntokens,
    dim_in=emsize,
    dim_key=emsize // nhead,  # per-head key size
    heads=nhead,
    dim_internal=d_hid,
    copies=nlayers,
    lr=1.0,
    start=starting_wrd,
    end=ending_wrd,
)
# Optional learning-rate decay, currently disabled:
# scheduler = torch.optim.lr_scheduler.StepLR(opt, 1.0, gamma=0.95)
model_operate = mb.NN_operating_tools(model, opt, learning_rate_schedule=None)

Untrained network acting on a random training example

In [9]:
# Pull a single batch from the shuffled training dataloader.
# NOTE(review): x_en / x_de look like encoder / decoder inputs and y the
# target — confirm against dt.get_tranformer_dataloader.
(x_en, x_de), y = next(iter(my_train_dataloader))
# Show the sequence at index 5 of the batch as readable text.
m_decode(x_en[5])

'. Pablo <unk> , who jointly ran Radio El Mundo with Eva Duarte , is said to have not liked her , but to have noted that she was'

In [10]:
# Forward pass of the model on the sampled batch.
# The second return value is not used here. It is bound to a named throwaway
# rather than `_`, because assigning to `_` in an IPython/Jupyter session
# overrides the built-in "last output" variable and stops IPython from
# refreshing `_`, `__` and `___` while that assignment is in place.
out, _model_aux = model([x_en, x_de])
# NOTE(review): the previous cell displays x_en[5] while this one decodes
# out[2]; if the intent is to compare input and output of the same example,
# the indices should match — confirm.
m_decode(out[2])

'copy quarrel analysis sinking quarrel chap protons copy copy copy analysis copy copy protons chap analysis protons analysis Fernandez protons copy protons protons protons protons loan relocated analysis copy copy copy copy protons copy'

In [17]:
# Autoregressive generation from the first sequence of the batch (kept as a
# batch of one via unsqueeze), decoded back to text.
# NOTE(review): 34 is presumably the number of generation steps
# (seq_length - 1?) — confirm against model.autoregression.
m_decode(
    model.autoregression(
        x_en[0].unsqueeze(0),
        34,
        mode="beam",
    )
)

'the the copy relocated chemical 364 Fernandez protons protons analysis copy protons protons protons analysis protons protons analysis relocated analysis sonic protons protons copy intents analysis protons protons copy analysis copy analysis copy copy protons copy'

Training

In [15]:
# Train on the custom-tokenizer dataloader.
# NOTE(review): the leading 15 is presumably the number of epochs — confirm
# against NN_operating_tools.fit_transformer.
model_operate.fit_transformer(
    15,
    my_train_dataloader,
    tokenizer=my_coders,
    sample_input=x_en[0].unsqueeze(0),  # first batch sequence, as a batch of one
)

KeyboardInterrupt: 