Mostly re-using Nava's code to:

1.   Download data from arXiv
2.   Tokenize using spaCy
3.   Build data batches using PyTorch

In [1]:
# Detect Google Colab by looking for 'google.colab' in the string form of the
# active IPython shell object.
IN_COLAB = 'google.colab' in str(get_ipython())
# NOTE(review): TRAIN is not referenced by any other cell visible in this
# notebook; presumably meant to gate the training cell - confirm or remove.
TRAIN = True

if IN_COLAB:
    # On Colab the project lives on Google Drive: mount it (forcing a remount)
    # and make the project folder the working directory so that the relative
    # paths used below (datasets/, vocab/, models/) resolve.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    %cd /content/drive/MyDrive/Documents/HLML/abstracts/local_tz/

Mounted at /content/drive
/content/drive/MyDrive/Documents/HLML/abstracts/local_tz


**Downloading data from arxiv**

In [4]:
!pip install feedparser

import urllib.request
import feedparser
import pandas as pd

Collecting feedparser
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[?25l[K     |████                            | 10 kB 22.1 MB/s eta 0:00:01[K     |████████                        | 20 kB 28.5 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 16.5 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 12.0 MB/s eta 0:00:01[K     |████████████████████▏           | 51 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 5.9 MB/s eta 0:00:01[K     |████████████████████████████▎   | 71 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████████████| 81 kB 3.8 MB/s 
[?25hCollecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6065 sha256=031645bb7ec30c3a3d77f48d1f26b22329a5eaeb7fff51e8e985de4adb710262
  Stored in directory

**Retrieving and saving data from Arxiv API:**

Collecting all entries which contain the keyword 'system', and saving titles, abstracts, authors, category and tags.

Run this cell if the pickled DataFrame does not exist:

In [None]:
# Download up to `target_entries` arXiv records matching `search_query` and
# pickle them as a DataFrame with one column per field listed in `col`.
base_url = 'http://export.arxiv.org/api/query?'

search_query = 'all:system'
col = ['title', 'summary', 'authors', 'arxiv_primary_category', 'tags']

# if we ask for too many entries at once, we won't be guaranteed to get all of them
# hence we need to iterate through several pages of results to build up our dataset
target_entries = 10**4   # stop once at least this many entries have been collected
max_results = 200        # page size requested from the API on every call

page_frames = []         # one DataFrame per page, concatenated once after the loop
counter = 0              # entries collected so far; also the offset of the next page

while counter < target_entries:

    start = counter
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                        start,
                                                        max_results)
    response = urllib.request.urlopen(base_url + query).read()
    feed = feedparser.parse(response)

    if not feed.entries:
        # An empty page would leave `counter` unchanged and the loop would
        # request the same page forever, so stop and keep what we have.
        print("No entries returned for start=%i; stopping after %i entries."
              % (start, counter))
        break

    # entry.get(c) yields None for a field that is missing from an entry.
    data_dict = {c: [entry.get(c) for entry in feed.entries] for c in col}
    page_frames.append(pd.DataFrame(data_dict, columns=col))
    counter += len(feed.entries)

# Build the final frame in one step: DataFrame.append no longer exists in
# pandas >= 2.0 and growing a frame inside the loop copies it on every page.
if page_frames:
    master_df = pd.concat(page_frames, ignore_index=True)
else:
    master_df = pd.DataFrame(columns=col)

print(master_df)
master_df.to_pickle("datasets/arxiv_system_10000.pkl")

                                                  title  ...                                               tags
0     Compact integral manifolds of differential sys...  ...  [{'term': 'math.DS', 'scheme': 'http://arxiv.o...
1          Morphisms of Networks of Hybrid Open Systems  ...  [{'term': 'math.DS', 'scheme': 'http://arxiv.o...
2     First integrals of ordinary linear differentia...  ...  [{'term': 'math.DS', 'scheme': 'http://arxiv.o...
3     Complex Systems + Systems Engineering = Comple...  ...  [{'term': 'cs.MA', 'scheme': 'http://arxiv.org...
4            Systems of quotients of Lie triple systems  ...  [{'term': 'math.RA', 'scheme': 'http://arxiv.o...
...                                                 ...  ...                                                ...
9995  Simulations of fluctuations of quantum statist...  ...  [{'term': 'cond-mat.stat-mech', 'scheme': 'htt...
9996  Measurements of the Yield Stress in Frictionle...  ...  [{'term': 'cond-mat.soft', 'scheme': 'http

This cell loads the DataFrame:

In [5]:
# Reload the DataFrame from the same relative path the download cell writes to,
# so the download does not have to be repeated.
master_df = pd.read_pickle("datasets/arxiv_system_10000.pkl")

**Tokenize using spacy**

In [6]:
# Fetch the en_core_web_lg spaCy model package (shell command run from the notebook).
!python -m spacy download en_core_web_lg

import spacy
import en_core_web_lg
# Load the model once; the visible cells below only use its tokenizer (nlp.tokenizer).
nlp = en_core_web_lg.load()

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.1 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=4ed04397e66fe7b28ad2d6e9765771740e2afaff1cce7b140a375d15cb392cab
  Stored in directory: /tmp/pip-ephem-wheel-cache-kq34tmlc/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [9]:
# Normalise whitespace in the raw titles before tokenizing.
# Every run of whitespace (line breaks included) collapses to ONE space:
#   * regex=True is passed explicitly because pandas >= 2.0 treats the pattern
#     as a literal string by default, in which case r'\n' would look for a
#     backslash followed by 'n' and never match a real newline;
#   * collapsing to a single space (rather than deleting double spaces) keeps
#     neighbouring words from being glued together when they are separated by
#     an even number of whitespace characters.
titles = master_df.title.str.replace(r'\s+', ' ', regex=True)
# Lower-case and trim each title, then run only the spaCy tokenizer on it.
doc = [nlp.tokenizer(text.lower().strip()) for text in titles]

In [7]:
## manually constructing vocabulary
class Vocabulary:
    """Word <-> index lookup tables with simple corpus statistics.

    Three special tokens are registered on construction, in this order, so
    that they always receive indices 0, 1 and 2 (each with a count of 1):
    '<PAD>', '<BOS>', '<EOS>'.

    Attributes:
        name: free-form label given by the caller.
        word2index: word -> integer index.
        word2count: word -> number of times the word was added.
        index2word: integer index -> word.
        num_words: number of distinct words, special tokens included.
        num_sentences: number of sentences passed to add_sentence.
        longest_sentence: token count of the longest sentence seen so far.
    """
    PAD_token = 0   # Used for padding short sentences
    BOS_token = 1   # Beginning-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        self.num_sentences = 0
        self.longest_sentence = 0

        # Registration order fixes the indices: PAD=0, BOS=1, EOS=2.
        for special in ('<PAD>', '<BOS>', '<EOS>'):
            self.add_word(special)

    def add_word(self, word):
        """Register `word`, or bump its count if it is already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # First occurrence: the next free index is the current vocabulary size.
        new_index = self.num_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.num_words = new_index + 1

    def add_sentence(self, sentence):
        """Add every token of `sentence` (an iterable of objects with `.text`)."""
        words = [token.text for token in sentence]
        for word in words:
            self.add_word(word)
        self.longest_sentence = max(self.longest_sentence, len(words))
        self.num_sentences += 1

    def to_word(self, index):
        """Return the word stored at `index` (KeyError if the index is unknown)."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index of `word` (KeyError if the word is unknown)."""
        return self.word2index[word]

In [10]:
# Build the vocabulary from every tokenized title in `doc`.
voc = Vocabulary('abstracts')
for sent in doc:
    voc.add_sentence(sent)

In [11]:
# Encode each tokenized title as a list of vocabulary indices wrapped in
# <BOS> ... <EOS>.
bos_index = voc.to_index("<BOS>")
eos_index = voc.to_index("<EOS>")
Input_list = [
    [bos_index] + [voc.to_index(token.text) for token in title_tokens] + [eos_index]
    for title_tokens in doc
]

**Building datasets**

In [12]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence

class CustomTextDataset(Dataset):
    """Thin Dataset wrapper around an indexable collection of samples."""

    def __init__(self, data):
        # Kept by reference; samples are returned exactly as stored.
        self.data = data

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position `idx`."""
        return self.data[idx]

# Wrap the encoded titles and split them roughly 72% / 10% / 18% into
# train / validation / test with a fixed seed so the split is reproducible.
arxiv_dataset = CustomTextDataset(Input_list)
data_len = len(Input_list)
print(data_len)

train_len = int(data_len * 0.72)
validation_len = int(data_len * 0.1)
# The test split takes whatever is left. Deriving it from the two sizes above
# guarantees the three lengths add up to data_len, which random_split requires;
# the truncations int(0.72*n) + int(0.1*n) do not always equal int(0.82*n).
test_len = data_len - train_len - validation_len

train_list, validation_list, test_list = random_split(
    arxiv_dataset,
    [train_len, validation_len, test_len],
    generator=torch.Generator().manual_seed(42),
)

10000


In [13]:
def collate_batch(batch):
    """Turn a list of index sequences into padded (input, target) tensors.

    For every sequence the input is the sequence without its last element and
    the target is the sequence without its first element, i.e. the target is
    the input shifted by one position. Both lists are padded with 0 by
    pad_sequence using its default layout (sequence dimension first).

    Returns:
        (padded_inputs, padded_targets)
    """
    inputs = [torch.tensor(sequence[:-1]) for sequence in batch]   # data
    targets = [torch.tensor(sequence[1:]) for sequence in batch]   # trg
    padded_inputs = pad_sequence(inputs, padding_value=0.0)
    padded_targets = pad_sequence(targets, padding_value=0.0)
    return padded_inputs, padded_targets

batch_size = 30

def create_iterators(batch_size=batch_size):
    """Helper function to create the iterators.

    Builds one DataLoader per split, in the order train, validation, test,
    each batching with `collate_batch`. No shuffle argument is passed, so the
    DataLoader default ordering applies to all three loaders.

    Note: the default for `batch_size` is captured when this function is
    defined; re-assigning the module-level `batch_size` later does not change it.

    Returns:
        list of three DataLoader objects: [train, validation, test].
    """
    return [
        DataLoader(split, batch_size=batch_size, collate_fn=collate_batch)
        for split in (train_list, validation_list, test_list)
    ]

**Make model and train**

In [15]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from misc_functions import attention, make_std_mask
from gpt_model import *
import math, copy, time

In [16]:
def make_model(vocab, N=12,
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Helper: construct a GPT model from hyperparameters.

    Args:
        vocab: vocabulary size, passed to Embeddings and Generator.
        N: second argument of Decoder (presumably the number of stacked
            decoder layers - confirm in gpt_model).
        d_model: model width passed to every sub-module.
        d_ff: second argument of PositionwiseFeedForward.
        h: first argument of MultiHeadedAttention (presumably the head count).
        dropout: dropout value passed to the sub-modules that accept one.

    Returns:
        The GPT object, with every parameter of more than one dimension
        re-initialised by nn.init.xavier_uniform_.
    """
    # Prototype sub-modules; deep copies are handed to the layers so that the
    # prototypes themselves are not shared with the model.
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    decoder = Decoder(
        DecoderLayer(d_model, copy.deepcopy(attn), copy.deepcopy(ff), dropout), N)
    ## Sequential passes input to the forward() method in the first module it stores
    ## and then "chains" outputs to inputs sequentially for subsequent modules,
    embed = nn.Sequential(Embeddings(d_model, vocab), copy.deepcopy(position))
    generator = Generator(d_model, vocab)
    model = GPT(decoder, embed, generator)

    # Initialize parameters with Glorot / fan_avg: xavier_uniform_ overwrites
    # the tensor in place with samples from the Xavier/Glorot uniform
    # distribution. Only tensors with more than one dimension are touched;
    # 1-D parameters keep the values given to them by their constructors.
    for weight in filter(lambda p: p.dim() > 1, model.parameters()):
        nn.init.xavier_uniform_(weight)
    return model

Optimizer, loss function

In [17]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate before every update.

    The rate used at step s >= 1 is

        factor * model_size**-0.5 * min(s**-0.5, s * warmup**-1.5)

    i.e. it grows linearly for the first `warmup` steps and then decays with
    the inverse square root of the step number.

    Args:
        model_size: model width used in the scaling term.
        factor: constant multiplier of the schedule.
        warmup: number of warm-up steps.
        optimizer: wrapped optimizer; must expose `param_groups` (dicts with
            an 'lr' entry) and `step()`.
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0          # number of updates performed so far
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0          # learning rate used by the most recent step

    def step(self):
        """Advance the schedule, write the new rate into every parameter
        group of the wrapped optimizer, then perform the optimizer update."""
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Return the learning rate for `step` (default: the current step).

        Steps <= 0 (for example the wrapper's state before the first update)
        yield 0.0, the limit of the warm-up branch, instead of raising
        ZeroDivisionError from `0 ** -0.5`.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
             min(step ** (-0.5), step * self.warmup ** (-1.5)))


class LabelSmoothing(nn.Module):
    """Label-smoothed KL-divergence loss, summed over all elements.

    For every row the target distribution puts `1 - smoothing` on the true
    class and spreads `smoothing` evenly over the remaining classes except
    the padding class (hence the division by `size - 2`). The padding column
    is always zero, and rows whose target IS the padding index are zeroed
    completely so that they contribute nothing to the loss.

    Args:
        size: number of classes; must equal `x.size(1)` in forward.
        padding_idx: class index that marks padding.
        smoothing: probability mass moved away from the true class.
    """
    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        # reduction='sum' is the current spelling of the deprecated
        # size_average=False: the loss is summed, not averaged.
        self.criterion = nn.KLDivLoss(reduction='sum')  # Kullback-Leibler divergence loss
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None   # target distribution of the latest forward call

    def forward(self, x, target):
        """Return the summed KL divergence between `x` and the smoothed target.

        Args:
            x: 2-D tensor with `size` columns. KLDivLoss expects its input to
                hold log-probabilities.
            target: 1-D tensor of class indices, one per row of `x`.
        """
        assert x.size(1) == self.size
        # Fresh tensor with x's shape/dtype/device that is not part of the
        # autograd graph.
        true_dist = torch.full_like(x, self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        # Zero whole rows that belong to padding positions. A boolean mask
        # behaves the same for zero, one or many padded rows.
        true_dist.masked_fill_((target == self.padding_idx).unsqueeze(1), 0.0)
        self.true_dist = true_dist.requires_grad_(False)

        return self.criterion(x, true_dist)
  
  
class SimpleLossCompute:
    """Compute the normalised loss and, when an optimizer is given, train.

    Args:
        generator: callable applied to the model output before the loss.
        criterion: callable `(scores_2d, targets_1d) -> loss`.
        opt: optional optimizer wrapper exposing `step()` and
            `optimizer.zero_grad()` (e.g. NoamOpt). With `opt=None` the
            object only evaluates the loss.
    """
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        """Return the un-normalised loss for one batch.

        Args:
            x: model output; flattened to 2-D over its last dimension after
                the generator has been applied.
            y: targets; flattened to 1-D.
            norm: normaliser the loss is divided by before backpropagation
                (the caller in this notebook passes the non-padding token count).
        """
        x = self.generator(x)

        loss = self.criterion(x.contiguous().view(-1, x.size(-1)),
                              y.contiguous().view(-1)) / norm

        # Backpropagate only when training. Running backward() during
        # evaluation would accumulate gradients that nothing clears, and they
        # would be added to the first training step that follows.
        if self.opt is not None:
            loss.backward()
            self.opt.step()
            self.opt.optimizer.zero_grad()

        # Hand back a detached value scaled to the total (un-normalised) loss.
        loss_value = loss.detach()
        if loss_value.dim() > 0:
            loss_value = loss_value[0]
        return loss_value * norm


Make model

In [None]:
# The vocabulary size (special tokens included) is what make_model passes to
# the Embeddings and Generator modules.
V = voc.num_words
model = make_model(V, N=12)



In [18]:
def run_epoch(data_iterator, model, loss_compute):
    """Run the model over every batch of `data_iterator` and log progress.

    Args:
        data_iterator: iterable of batches; element 0 holds the inputs and
            element 1 the targets, each transposed before use (the loaders
            built in this notebook yield sequence-first tensors).
        model: module called as `model.forward(data, mask)`.
        loss_compute: callable `(out, trg, ntokens) -> loss`; whether an
            optimisation step is taken is decided inside it.

    Returns:
        Accumulated loss divided by the accumulated number of non-zero
        target tokens.
    """
    window_start = time.time()
    total_tokens = 0
    total_loss = 0
    window_tokens = 0   # tokens processed since the last progress line

    for step, batch in enumerate(data_iterator):
        data, trg = batch[0].T, batch[1].T
        # make_std_mask is an imported helper; presumably it masks padding
        # (index 0) and future positions - confirm in misc_functions.
        out = model.forward(data, make_std_mask(trg, pad=0))
        ntokens = (trg != 0).data.sum()   # number of non-padding target tokens
        loss = loss_compute(out, trg, ntokens)

        total_loss += loss
        total_tokens += ntokens
        window_tokens += ntokens

        if step % 50 != 1:
            continue

        # Progress line at steps 1, 51, 101, ...; the throughput counter and
        # timer restart after each report.
        elapsed = time.time() - window_start
        print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
              (step, loss / ntokens, window_tokens / elapsed))
        window_start = time.time()
        window_tokens = 0

    return total_loss / total_tokens

In [None]:
# Loss: label smoothing is switched off (smoothing=0.0); index 0 is padding.
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)

## uses pytorch's Adam optimizer
# lr=0 is only a placeholder: NoamOpt overwrites the rate before every update.
adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
model_opt = NoamOpt(model.embed[0].d_model, 1, 4000, adam)

train_iterator, valid_iterator, test_iterator = create_iterators()

# One loss helper per mode: the training one carries the optimizer, the
# validation one is given None.
train_loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
valid_loss_compute = SimpleLossCompute(model.generator, criterion, None)

for epoch in range(10):
    model.train()
    run_epoch(train_iterator, model, train_loss_compute)
    model.eval()
    run_epoch(valid_iterator, model, valid_loss_compute)




Epoch Step: 1 Loss: 2.026712 Tokens per Sec: 91.715179
Epoch Step: 51 Loss: 2.529584 Tokens per Sec: 99.184006
Epoch Step: 101 Loss: 2.204535 Tokens per Sec: 100.041481
Epoch Step: 151 Loss: 1.757971 Tokens per Sec: 101.556664
Epoch Step: 201 Loss: 1.203714 Tokens per Sec: 100.033089
Epoch Step: 1 Loss: 1.667969 Tokens per Sec: 101.408867
Epoch Step: 1 Loss: 1.967870 Tokens per Sec: 101.181755
Epoch Step: 51 Loss: 2.038388 Tokens per Sec: 99.432251
Epoch Step: 101 Loss: 1.587625 Tokens per Sec: 99.855103
Epoch Step: 151 Loss: 1.530269 Tokens per Sec: 101.193123
Epoch Step: 201 Loss: 1.156478 Tokens per Sec: 100.269638
Epoch Step: 1 Loss: 1.502966 Tokens per Sec: 102.106621
Epoch Step: 1 Loss: 1.928759 Tokens per Sec: 102.483704
Epoch Step: 51 Loss: 1.834937 Tokens per Sec: 100.301697
Epoch Step: 101 Loss: 1.393072 Tokens per Sec: 100.443863
Epoch Step: 151 Loss: 1.297270 Tokens per Sec: 101.833923
Epoch Step: 201 Loss: 1.098134 Tokens per Sec: 100.500153
Epoch Step: 1 Loss: 1.303021 To

**Saving the trained model (and vocab):**

In [None]:
import pickle

# Persist the vocabulary object next to the weights: the index <-> word
# mapping is needed to encode prompts and decode the model's predictions.
with open("vocab/arxiv_system_titles_10000.pkl", 'wb') as vocab_file:
    pickle.dump(voc, vocab_file, protocol=pickle.HIGHEST_PROTOCOL)

# Only the parameters (state_dict) are stored, not the model object itself.
torch.save(model.state_dict(), "models/titles_system_std_mask_10000_30epochs.pt")

Loading model (and vocab):

In [22]:
import pickle

# Restore the pickled vocabulary. Only load pickle files you created yourself:
# unpickling can execute arbitrary code.
with open("vocab/arxiv_system_titles_10000.pkl", 'rb') as vocab_file:
    voc = pickle.load(vocab_file)

# Rebuild the architecture with the same settings used for training, then
# load the saved parameters into it.
V = voc.num_words
model = make_model(V, N=12)
saved_state = torch.load("models/titles_system_std_mask_10000_30epochs.pt")
model.load_state_dict(saved_state)

<All keys matched successfully>

In [26]:
# Vocabulary size of the loaded vocabulary (the three special tokens included).
print(V)

9752


Testing:

In [20]:
def greedy_decode(model, max_len, symbol_list):
    """Extend a prompt token by token, always taking the highest-scoring word.

    Uses the notebook-level vocabulary `voc` to detect '<EOS>' and to print
    the decoded words.

    Args:
        model: module called as `model.forward(ys, mask)` and exposing
            `model.generator`.
        max_len: at most `max_len - 1` tokens are appended to the prompt.
        symbol_list: non-empty list of vocabulary indices forming the prompt.

    Returns:
        Long tensor of shape (1, length) holding the prompt followed by the
        generated indices (ending with '<EOS>' if it was produced).

    Raises:
        ValueError: if `symbol_list` is empty.
    """
    if len(symbol_list) == 0:
        raise ValueError("symbol_list must contain at least one token index")

    eos_index = voc.to_index('<EOS>')
    # Build the index tensor as integers directly rather than converting from
    # float32, which cannot represent every large integer exactly.
    ys = torch.tensor([symbol_list], dtype=torch.long)

    # Decoding needs no gradients; no_grad avoids building autograd graphs.
    with torch.no_grad():
        for _ in range(max_len - 1):
            out = model.forward(ys, subsequent_mask(ys.size(1)))
            prob = model.generator(out[:, -1])   # scores for the last position only
            _, best = torch.max(prob, dim=1)
            next_word = best[0].item()
            ys = torch.cat([ys,
                            torch.full((1, 1), next_word, dtype=torch.long)], dim=1)
            if next_word == eos_index:
                break

    print([voc.to_word(index.item()) for index in ys[0]])
    return ys

In [23]:
# Put the model in evaluation mode, then greedily continue the prompt
# "<BOS> an understanding" (at most 30 tokens in total).
model.eval()
sentence_start = ['<BOS>', 'an', 'understanding']
symbol_list = list(map(voc.to_index, sentence_start))
output = greedy_decode(model, 30, symbol_list)

['<BOS>', 'an', 'understanding', '-', 'complex', '-', 'the', 'complexity', 'of', 'the', 'immune', 'system', ':', 'a', 'complex', '-', 'theoretic', 'approach', 'to', 'parametric', 'complexity', '<EOS>']


In [None]:
# Greedy continuation of the prompt "<BOS> fluctuations of".
sentence_start = ['<BOS>', 'fluctuations', 'of']
symbol_list = list(map(voc.to_index, sentence_start))
output = greedy_decode(model, 30, symbol_list)

['<BOS>', 'fluctuations', 'of', 'quantum', 'statistical', 'two', '-', 'dimensional', 'systems', 'of', 'electrons', '<EOS>']


In [None]:
# Greedy continuation of the prompt "<BOS> understanding".
sentence_start = ['<BOS>', 'understanding']
symbol_list = list(map(voc.to_index, sentence_start))
output = greedy_decode(model, 30, symbol_list)

['<BOS>', 'understanding', 'the', 'influence', 'of', 'individual', "'s", 'self', '-', 'efficacy', 'for', 'information', 'systems', 'security', 'innovation', 'adoption', ':', 'a', 'systematic', 'literature', 'review', '<EOS>']


In [None]:
# Greedy continuation of the prompt "<BOS> the mechanism".
sentence_start = ['<BOS>', 'the', 'mechanism']
symbol_list = list(map(voc.to_index, sentence_start))
output = greedy_decode(model, 30, symbol_list)

['<BOS>', 'the', 'mechanism', 'of', 'scale', '-', 'invariance', '<EOS>']


In [None]:
# Greedy continuation of the prompt "<BOS> the analog".
sentence_start = ['<BOS>', 'the', 'analog']
symbol_list = list(map(voc.to_index, sentence_start))
output = greedy_decode(model, 30, symbol_list)

['<BOS>', 'the', 'analog', 'of', 'the', 't', 'system', ':', 'a', 'massive', 'component', 'sharing', 'for', 'the', 'voxceleb', 'learning', '<EOS>']


In [28]:
# Greedy continuation of two prompts in turn: "<BOS> open" and "<BOS> $".
for sentence_start in (['<BOS>', 'open'], ['<BOS>', '$']):
    symbol_list = [voc.to_index(token) for token in sentence_start]
    output = greedy_decode(model, 30, symbol_list)

['<BOS>', 'open', 'quantum', 'system', 'identification', '<EOS>']
['<BOS>', '$', '\\text{h}_{\\infty}$', 'tracking', 'control', 'via', 'variable', 'gain', 'gradient', 'descent', '-', 'based', 'integral', 'reinforcement', 'learning', 'for', 'unknown', 'continuous', 'time', 'nonlinear', 'system', '<EOS>']
