In [1]:
# True when the IPython shell object reports that it is a Google Colab shell.
IN_COLAB = 'google.colab' in str(get_ipython())
# NOTE(review): TRAIN is not referenced anywhere else in the visible notebook — confirm it is still needed.
TRAIN = True

if IN_COLAB:
    # Mount Google Drive and change into the project folder, so that the relative
    # paths used later ("vocab/...", "models/...") and the local module imports
    # resolve against that folder.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %cd /content/drive/MyDrive/Documents/HLML/abstracts/local_tz/

Mounted at /content/drive
/content/drive/MyDrive/Documents/HLML/abstracts/local_tz


In [2]:
# Download the large English spaCy model via a shell command (re-runs on every execution of this cell).
!python -m spacy download en_core_web_lg

import spacy
import en_core_web_lg
# Load the pipeline; the visible cells only use its tokenizer (nlp.tokenizer).
nlp = en_core_web_lg.load()

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.4 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=4ff4b59f74660614a84b06d2e907f0bf1c3eb22c6c038888769a40c2423e07b7
  Stored in directory: /tmp/pip-ephem-wheel-cache-tmnogkdo/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
## manually constructing vocabulary
class Vocabulary:
    """Bidirectional word <-> index mapping with per-word occurrence counts.

    The special tokens '<PAD>', '<BOS>' and '<EOS>' are always registered
    first and therefore occupy indices 0, 1 and 2 respectively.
    """

    PAD_token = 0   # Used for padding short sentences
    BOS_token = 1   # Beginning-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        """Create a vocabulary called ``name`` that holds only the special tokens."""
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        self.num_sentences = 0
        self.longest_sentence = 0

        # Register the special tokens in index order, each with a count of 1.
        for special in ('<PAD>', '<BOS>', '<EOS>'):
            self.word2index[special] = self.num_words
            self.word2count[special] = 1
            self.index2word[self.num_words] = special
            self.num_words += 1

    def add_word(self, word):
        """Count one occurrence of ``word``, assigning the next free index if it is new."""
        if word in self.word2index:
            # Already known: only the occurrence counter changes.
            self.word2count[word] += 1
            return

        new_index = self.num_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.num_words = new_index + 1

    def add_sentence(self, sentence):
        """Add every token of ``sentence`` (an iterable of objects with a ``.text`` attribute).

        Also updates the sentence counter and the longest sentence length seen so far.
        """
        words = [token.text for token in sentence]
        for word in words:
            self.add_word(word)
        self.longest_sentence = max(self.longest_sentence, len(words))
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored at ``index``; raises KeyError for an unknown index."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index of ``word``; raises KeyError for an unknown word."""
        return self.word2index[word]

In [4]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence

class CustomTextDataset(Dataset):
    """Thin ``Dataset`` adapter around an indexable collection of samples."""

    def __init__(self, data):
        # Kept by reference; no copy is made.
        self.data = data

    def __len__(self):
        """Number of samples in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``, unchanged."""
        return self.data[idx]

In [6]:
def collate_batch(batch):
    """Turn a list of index sequences into padded (input, target) tensors.

    For every sample the input is the sequence without its last element and
    the target is the sequence without its first element (next-token
    prediction). Both lists are padded with 0 by ``pad_sequence`` using its
    default layout, so each returned tensor is (longest_length, batch_size).
    """
    samples = list(batch)
    inputs = [torch.tensor(sample[:-1]) for sample in samples]   # data
    targets = [torch.tensor(sample[1:]) for sample in samples]   # trg
    padded_inputs = pad_sequence(inputs, padding_value=0.0)
    padded_targets = pad_sequence(targets, padding_value=0.0)
    return padded_inputs, padded_targets

batch_size = 64

def create_iterators(batch_size=batch_size):
    """Helper function to create the iterators.

    Returns a list with one DataLoader per split, in the order
    [train, validation, test]. Each loader batches with ``collate_batch``
    and reads the module-level ``train_list`` / ``validation_list`` /
    ``test_list`` at call time.
    """
    splits = (train_list, validation_list, test_list)
    return [
        DataLoader(split, batch_size=batch_size, collate_fn=collate_batch)
        for split in splits
    ]

In [7]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from misc_functions import attention, make_std_mask
from gpt_model import *
import math, copy, time

In [8]:
from torchtext.datasets import WikiText2
train_iter, val_iter, test_iter = WikiText2(split=('train', 'valid', 'test'))

# Each encoded sample keeps at most this many tokens between <BOS> and <EOS>.
MAX_SENTENCE_TOKENS = 35

Input_list = []

voc_w2 = Vocabulary('wikitext2')


def _encode_lines(lines, vocab, max_tokens=MAX_SENTENCE_TOKENS):
    """Tokenise raw text lines, register them in ``vocab`` and encode them.

    Lines of length <= 2 are skipped. Every remaining line is lower-cased,
    stripped and tokenised with ``nlp.tokenizer``. All of its tokens are added
    to ``vocab``, but only the first ``max_tokens`` tokens are encoded.

    Returns a list of index lists of the form [<BOS>, tok_1, ..., tok_k, <EOS>].
    """
    bos = vocab.to_index("<BOS>")
    eos = vocab.to_index("<EOS>")
    encoded = []
    for line in lines:
        if len(line) <= 2:  ## ignore the newlines
            continue
        sent = nlp.tokenizer(line.lower().strip())
        vocab.add_sentence(sent)
        body = [vocab.to_index(token.text) for token in sent[:max_tokens]]
        encoded.append([bos] + body + [eos])
    return encoded


# The three official splits are pooled (train, then valid, then test) and
# re-split randomly below.
for split_iter in (train_iter, val_iter, test_iter):
    Input_list.extend(_encode_lines(split_iter, voc_w2))

wikitext2_dataset = CustomTextDataset(Input_list)
data_len = len(Input_list)
print(data_len)

# 72% / 10% / remainder split, seeded so the partition is reproducible.
n_train = int(data_len * 0.72)
n_valid = int(data_len * 0.1)
n_test = data_len - (n_train + n_valid)
train_list, validation_list, test_list = random_split(
    wikitext2_dataset, [n_train, n_valid, n_test],
    generator=torch.Generator().manual_seed(42))

100%|██████████| 4.48M/4.48M [00:00<00:00, 6.86MB/s]


29119


In [9]:
def make_model(vocab, N=12, 
			   d_model=512, d_ff=2048, h=8, dropout=0.1):
	"""Helper: Construct a model from hyperparameters.

	Args:
		vocab: forwarded as the second argument to both Embeddings and Generator.
			This notebook calls make_model with voc_w2.num_words, i.e. the vocabulary size.
		N: forwarded to Decoder together with the DecoderLayer
			(presumably the number of stacked layers — confirm in gpt_model).
		d_model: forwarded to the attention, feed-forward, positional-encoding,
			decoder-layer, embedding and generator constructors.
		d_ff: forwarded to PositionwiseFeedForward.
		h: forwarded to MultiHeadedAttention.
		dropout: forwarded to the feed-forward, positional-encoding and decoder-layer constructors.

	Returns:
		The GPT object, after every parameter with more than one dimension has
		been re-initialised with nn.init.xavier_uniform_.
	"""

	## returns a GPT object built from a Decoder stack, an embedding pipeline and a Generator
	c = copy.deepcopy
	attn = MultiHeadedAttention(h, d_model)
	ff = PositionwiseFeedForward(d_model, d_ff, dropout)
	position = PositionalEncoding(d_model, dropout)
	# attn / ff / position are deep-copied where they are used, so the model
	# does not hold references to these template instances.
	model = GPT(Decoder(DecoderLayer(d_model, c(attn), c(ff), dropout), N),
		## Sequential passes input to the forward() method in the first module it stores
		## and then "chains" outputs to inputs sequentially for subsequent modules,
		nn.Sequential(Embeddings(d_model, vocab), c(position)),
		Generator(d_model, vocab))
	
	# This was important from their code. 
	# Initialize parameters with Glorot / fan_avg.
	# Only parameters with more than one dimension are touched; 1-D parameters
	# (e.g. bias vectors) keep the values their constructors gave them.
	for p in model.parameters():
		if p.dim() > 1:
			nn.init.xavier_uniform_(p) # in-place: overwrites p's values with Xavier/Glorot-uniform samples
	return model

In [13]:
class LabelSmoothing(nn.Module):
	"""Label-smoothed KL-divergence loss, summed over all elements.

	Args:
		size: number of classes; must equal ``x.size(1)`` in ``forward``.
		padding_idx: class index used for padding. Its column in the target
			distribution is always zero, and rows whose target equals it
			contribute nothing to the loss.
		smoothing: probability mass taken from the true class and spread
			uniformly over the ``size - 2`` classes that are neither the true
			class nor the padding class.

	Attributes:
		true_dist: the target distribution built by the most recent ``forward`` call.
	"""
	def __init__(self, size, padding_idx, smoothing=0.0):
		super(LabelSmoothing, self).__init__()
		# reduction='sum' is the supported spelling of the deprecated
		# size_average=False (which emits a warning on construction).
		self.criterion = nn.KLDivLoss(reduction='sum') # Kullback-Leibler divergence loss
		self.padding_idx = padding_idx
		self.confidence = 1.0 - smoothing
		self.smoothing = smoothing
		self.size = size
		self.true_dist = None
		
	def forward(self, x, target):
		"""Return the summed KL divergence between ``x`` and the smoothed targets.

		Args:
			x: tensor of shape (n, size). nn.KLDivLoss expects log-probabilities
				here — presumably what the model's generator emits; confirm in gpt_model.
			target: integer tensor of shape (n,) holding class indices.
		"""
		assert x.size(1) == self.size
		# Start from the uniform smoothing mass; no gradient flows through the target distribution.
		true_dist = torch.full_like(x.detach(), self.smoothing / (self.size - 2))
		true_dist.scatter_(1, target.detach().unsqueeze(1), self.confidence)
		true_dist[:, self.padding_idx] = 0
		# squeeze(1) always yields a 1-D index vector, including when exactly one
		# row is padded (a bare squeeze() would collapse that case to 0-d).
		padded_rows = torch.nonzero(target.detach() == self.padding_idx, as_tuple=False).squeeze(1)
		if padded_rows.numel() > 0:
			true_dist.index_fill_(0, padded_rows, 0.0)
		self.true_dist = true_dist.requires_grad_(False)

		return self.criterion(x, true_dist)
  
  
class SimpleLossCompute:
	"""Project model output to vocabulary scores, compute the loss and optionally train.

	Args:
		generator: callable applied to the model output before the criterion.
		criterion: callable ``criterion(scores_2d, targets_1d)`` returning the loss.
		opt: optimizer, or None for evaluation. A backward pass and a parameter
			update happen only when an optimizer is given.
	"""
	def __init__(self, generator, criterion, opt=None):
		self.generator = generator
		self.criterion = criterion # LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
		self.opt = opt # e.g. torch.optim.SGD(model.parameters(), lr=0.1)
		
	def __call__(self, x, y, norm):
		"""Return the un-normalised loss (loss / norm * norm) for one batch.

		Args:
			x: model output; its last dimension is what the generator consumes.
			y: target indices with the same leading dimensions as ``x``.
			norm: normaliser applied before the backward pass (the caller passes
				the number of non-padding target tokens).
		"""
		x = self.generator(x)
		# Flatten all leading dimensions so that every position is one row.
		loss = self.criterion(x.contiguous().view(-1, x.size(-1)),
							  y.contiguous().view(-1)) / norm

		if self.opt is not None:
			# Back-propagate only when training. Doing it during evaluation would
			# leave gradients on the parameters that the next optimizer step
			# (first batch of the following epoch) would silently apply.
			loss.backward()
			self.opt.step()
			self.opt.zero_grad() # if using default pytorch SGD or Adam
			# self.opt.optimizer.zero_grad() # if using NoamOpt class

		value = loss.detach()
		if value.dim() > 0:
			value = value[0]
		return value * norm


In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a global by run_epoch and greedy_decode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('torch.cuda.is_available():', torch.cuda.is_available())
print('Device:', device)

torch.cuda.is_available(): True
Device: cuda


In [12]:
# Vocabulary size, including the three special tokens registered by Vocabulary.__init__.
V = voc_w2.num_words
# Build the model (N=12 is forwarded to Decoder inside make_model) and move it to the selected device.
model = make_model(V, N=12).to(device)

In [26]:
def run_epoch(data_iterator, model, loss_compute):
	"""Standard Training and Logging Function

	Runs ``model`` over every batch of ``data_iterator`` and hands the output
	to ``loss_compute``, which also decides whether parameters are updated.

	Args:
		data_iterator: iterable of ``(data, trg)`` batches as produced by
			``collate_batch``; both tensors are transposed before use.
		model: module called as ``model(data, mask)``.
		loss_compute: callable ``loss_compute(out, trg, ntokens)`` returning the
			un-normalised batch loss.

	Returns:
		1-D numpy array with one float loss per batch, in iteration order.
	"""
	start = time.time()
	tokens = 0

	# Losses are stored as plain floats: device tensors would keep GPU memory
	# alive and cannot be turned into a numpy array while they live on CUDA.
	loss_record = []

	for i, batch in enumerate(data_iterator):
		data = batch[0].T.to(device)
		trg = batch[1].T.to(device)
		# The mask is derived from trg; data and trg are equal-length slices of
		# the same samples, so their padding positions coincide.
		mask = make_std_mask(trg, pad=0).to(device)
		out = model(data, mask)
		ntokens = (trg != 0).sum()
		loss = float(loss_compute(out, trg, ntokens))
		ntokens = int(ntokens)
		loss_record.append(loss)
		tokens += ntokens
		if i % 50 == 1:
			elapsed = time.time() - start
			# max(...) guards keep the log line from dividing by zero.
			print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
					(i, loss / max(ntokens, 1), tokens / max(elapsed, 1e-9)))
			start = time.time()
			tokens = 0
	return np.array(loss_record)

In [32]:
import os
import pickle

# Persist the vocabulary so that saved checkpoints can be decoded later.
# Unpickling needs the Vocabulary class to be defined/importable under the same name.
# Create the target folder first: open(..., 'wb') does not create missing directories.
os.makedirs("vocab", exist_ok=True)
with open("vocab/wikitext2.pkl", 'wb') as outp:
    pickle.dump(voc_w2, outp, pickle.HIGHEST_PROTOCOL)

In [34]:
def greedy_decode(model, max_len, symbol_list):
	"""Greedily extend a prompt one token at a time and print the decoded words.

	Args:
		model: module called as ``model(ys, mask)`` and exposing ``model.generator``.
		max_len: at most ``max_len - 1`` tokens are appended to the prompt, so
			the result holds up to ``len(symbol_list) + max_len - 1`` tokens.
		symbol_list: list of vocabulary indices forming the prompt.

	Returns:
		LongTensor of shape (1, length) with the prompt followed by the
		generated indices. Generation stops early once '<EOS>' is produced.
	"""
	eos_index = voc_w2.to_index('<EOS>')
	# Build the index tensor directly as int64; going through a float tensor
	# would round large indices.
	ys = torch.tensor([symbol_list], dtype=torch.long, device=device)
	# Pure inference: do not record an autograd graph.
	with torch.no_grad():
		for _ in range(max_len-1):
			out = model(ys, subsequent_mask(ys.size(1)).to(device))
			# Only the scores of the last position decide the next token.
			prob = model.generator(out[:, -1])
			_, next_word = torch.max(prob, dim = 1)
			next_word = next_word[0].item()
			ys = torch.cat([ys, 
							torch.full((1, 1), next_word, dtype=torch.long, device=device)], dim=1)
			if next_word == eos_index:
				break
	print([voc_w2.to_word(index.item()) for index in ys[0]])
	return ys

In [None]:
import os

NUM_EPOCHS = 100        # total number of passes over the training split
CHECKPOINT_EVERY = 10   # save weights and print a sample every this many epochs

criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)

## testing with different optimizers
model_opt = torch.optim.SGD(model.parameters(), lr=0.1)
# model_opt = torch.optim.Adam(model.parameters(), lr=1e-4)

# ## uses pytorch's Adam optimizer
# model_opt = NoamOpt(model.embed[0].d_model, 1, 4000,
# 		torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

train_iterator, valid_iterator, test_iterator = create_iterators()
training_loss = np.zeros(0)
validation_loss = np.zeros(0)

# torch.save does not create missing folders; make sure the checkpoint folder
# exists before the first (expensive) epoch runs.
os.makedirs("models", exist_ok=True)

for epoch in range(NUM_EPOCHS):
    model.train()
    tl = run_epoch(train_iterator, model, SimpleLossCompute(model.generator, criterion, model_opt))
    training_loss = np.concatenate([training_loss, tl])
    model.eval()
    vl = run_epoch(valid_iterator, model, SimpleLossCompute(model.generator, criterion, None))
    validation_loss = np.concatenate([validation_loss, vl])
    # Drop any gradients left on the parameters by the validation pass so that
    # they cannot leak into the first optimizer step of the next epoch.
    model.zero_grad()

    if epoch % CHECKPOINT_EVERY == 0:
        # NOTE(review): the file name spells "wikietext2"; kept unchanged so
        # that existing checkpoints and any loading code still match.
        torch.save(model.state_dict(), "models/wikietext2_SGD_{0}epochs.pt".format(epoch))
        sentence_start = ['<BOS>', 'the', 'dog', 'ran']
        symbol_list = [voc_w2.to_index(token) for token in sentence_start]
        print(greedy_decode(model, 30, symbol_list))




Epoch Step: 1 Loss: 26.998251 Tokens per Sec: 2961.576660
Epoch Step: 51 Loss: 6.078160 Tokens per Sec: 2862.364014
Epoch Step: 101 Loss: 5.978872 Tokens per Sec: 2858.023193
Epoch Step: 151 Loss: 5.900390 Tokens per Sec: 2870.828125
Epoch Step: 201 Loss: 5.913371 Tokens per Sec: 2863.014404
Epoch Step: 251 Loss: 5.538423 Tokens per Sec: 2839.856445
Epoch Step: 301 Loss: 6.158897 Tokens per Sec: 2889.676758
Epoch Step: 1 Loss: 5.876213 Tokens per Sec: 3593.694824
['<BOS>', 'the', 'dog', 'ran', ',', 'the', '<', 'unk', '>', 'of', 'the', '<', 'unk', '>', ',', 'the', '<', 'unk', '>', 'of', 'the', '<', 'unk', '>', 'of', 'the', '<', 'unk', '>', ',', 'the', '<', 'unk']
tensor([[   1,   21, 5135, 1969,   17,   21,   11,   12,   13,   20,   21,   11,
           12,   13,   17,   21,   11,   12,   13,   20,   21,   11,   12,   13,
           20,   21,   11,   12,   13,   17,   21,   11,   12]],
       device='cuda:0')
Epoch Step: 1 Loss: 14.031898 Tokens per Sec: 3243.184326
Epoch Step: 51 Loss:

In [None]:
# Switch the model to evaluation mode before generating a sample continuation.
model.eval()
# Prompt tokens; each must already be in the vocabulary, otherwise to_index raises KeyError.
sentence_start = ['<BOS>', 'the', 'dog', 'ran']
symbol_list = [voc_w2.to_index(token) for token in sentence_start]
# greedy_decode prints the decoded words and returns the tensor of token indices.
output = greedy_decode(model, 30, symbol_list)