Mostly re-using Nava's code to

1.   Download data from arxiv
2.   Tokenize using spacy
3.   Build data batches using pytorch

**Downloading data from arxiv**

In [1]:
!pip install feedparser

import urllib.request
import feedparser
import pandas as pd

Collecting feedparser
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/15bf6781a861bbc5dd801d467f26448fb322bfedcd30f2e62b148d104dfb/feedparser-6.0.8-py3-none-any.whl (81kB)
[K     |████                            | 10kB 16.0MB/s eta 0:00:01[K     |████████                        | 20kB 8.5MB/s eta 0:00:01[K     |████████████▏                   | 30kB 7.5MB/s eta 0:00:01[K     |████████████████▏               | 40kB 7.1MB/s eta 0:00:01[K     |████████████████████▏           | 51kB 4.2MB/s eta 0:00:01[K     |████████████████████████▎       | 61kB 4.4MB/s eta 0:00:01[K     |████████████████████████████▎   | 71kB 4.8MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 3.5MB/s 
[?25hCollecting sgmllib3k
  Downloading https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[

In [2]:
# Base api query url
base_url = 'http://export.arxiv.org/api/query?';

# Search parameters
search_query = 'all:electron' # search for electron in all fields
start = 0                     # retreive the first 5 results
max_results = 10**3

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# perform a GET request using the base_url and query
response = urllib.request.urlopen(base_url+query).read()

# parse the response using feedparser
feed = feedparser.parse(response)

#columns of interest
col=['title', 'summary', 'authors', 'arxiv_primary_category', 'tags']

# Run through each entry, and fill the information into a list
data_list=[]
for c in col:
	abstract_list=[]
	for entry in feed.entries:
		abstract_list.append(entry.get(c))
	data_list.append(abstract_list)
 
 # convert into a panda dataframe 
data_df = pd.DataFrame(data_list,index=col)
data_df=data_df.T

**Tokenize using spacy**

In [3]:
!python -m spacy download en_core_web_lg

import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.3MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp37-none-any.whl size=829180945 sha256=274b5d5bfdc4192420a97a4c07f222024658f60709271fbda4845fe0696e5813
  Stored in directory: /tmp/pip-ephem-wheel-cache-shnk_5wo/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [4]:
#taking titles
# using lower case. removing extra spaces and '\n ' 
doc=[nlp.tokenizer(text.lower().replace('\n ','').strip()) for text in data_list[0]]

In [42]:
## manually constructing vocabulary
class Vocabulary:
    PAD_token = 0   # Used for padding short sentences
    BOS_token = 1   # Beginning-of-sentence token
    EOS_token = 2   # End-of-sentence token

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.num_words = 0
        self.num_sentences = 0
        self.longest_sentence = 0

        ## add PAD, BOS, EOS tokens:
        self.word2index['<PAD>'] = self.num_words
        self.word2count['<PAD>'] = 1
        self.index2word[self.num_words] = '<PAD>'
        self.num_words += 1

        self.word2index['<BOS>'] = self.num_words
        self.word2count['<BOS>'] = 1
        self.index2word[self.num_words] = '<BOS>'
        self.num_words += 1

        self.word2index['<EOS>'] = self.num_words
        self.word2count['<EOS>'] = 1
        self.index2word[self.num_words] = '<EOS>'
        self.num_words += 1

    def add_word(self, word):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
            
    def add_sentence(self, sentence):
        sentence_len = 1 # length of sentence + <EOS> or <BOS>
        for word in [token.text for token in sentence]:
            sentence_len += 1
            self.add_word(word)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

In [43]:
voc=Vocabulary('abstracts')
for sent in doc:
 	voc.add_sentence(sent)
  
Input_list=[]
for sample in range(len(doc)):
	Input_list.append([voc.to_index("<BOS>")]+[voc.to_index(token.text) for token in doc[sample]]+[voc.to_index("<EOS>")])
Output_list=Input_list;
Input_Output_Data_list=[Input_list,Output_list]


**Building datasets**

In [44]:
from sklearn.model_selection import train_test_split

#10% test set
In_train, In_test, Out_train, Out_test = train_test_split(Input_list, Output_list, test_size=0.1, random_state=1)

#from 90% train set --> 20% validation and 80 % training (= in total we have 10% test, 18% val, 72% train )
In_train, In_val, Out_train, Out_val = train_test_split(In_train, Out_train , test_size=0.2, random_state=1)

train_list=  In_train
label=Out_train
validation_list=In_val
test_list=In_test

In [45]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    label_list, text_list = [], []
    for _sample in batch:
        label_list.append(torch.tensor(_sample[:-1])) # shift stuff here
        text_list.append(torch.tensor(_sample[1:]))
    return pad_sequence(label_list, padding_value=0.0), pad_sequence(text_list, padding_value=0.0)

batch_size = 30

def create_iterators(batch_size=batch_size):
    """Heler function to create the iterators"""
    dataloaders = []
    for split in [train_list, validation_list, test_list]:
        dataloader = DataLoader(
            split, batch_size=batch_size,
            collate_fn=collate_batch
            )
        dataloaders.append(dataloader)
    return dataloaders

train_iterator, valid_iterator, test_iterator = create_iterators()
for i, batch in enumerate(train_iterator):
  if i < 5:
    # print("0th element: ",batch[0])
    # print([index for index in batch[0]])
    print(batch[0].shape)
    print("0th element: ",[voc.to_word(index.item()) for index in batch[0].T[0]])
    print("1st element: ",[voc.to_word(index.item()) for index in batch[1].T[0]])
  else:
    break

torch.Size([26, 30])
0th element:  ['<BOS>', 'observation', 'of', 'electron', '-', 'hole', 'puddles', 'in', 'graphene', 'using', 'a', 'scanning', 'single', 'electron', 'transistor', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
1st element:  ['observation', 'of', 'electron', '-', 'hole', 'puddles', 'in', 'graphene', 'using', 'a', 'scanning', 'single', 'electron', 'transistor', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
torch.Size([26, 30])
0th element:  ['<BOS>', 'escape', 'of', 'trapped', 'electrons', 'from', 'a', 'helium', 'surface', ':', 'a', 'dynamical', 'theory', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
1st element:  ['escape', 'of', 'trapped', 'electrons', 'from', 'a', 'helium', 'surface', ':', 'a', 'dynamical', 'theory', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD

**Make model and train**

In [46]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from misc_functions import attention, subsequent_mask
from gpt_model import *
import math, copy, time

In [47]:
def make_model(vocab, N=12, 
			   d_model=512, d_ff=2048, h=8, dropout=0.1):
	"""Helper: Construct a model from hyperparameters."""

	## returns EncoderDecoder object
	c = copy.deepcopy
	attn = MultiHeadedAttention(h, d_model)
	ff = PositionwiseFeedForward(d_model, d_ff, dropout)
	position = PositionalEncoding(d_model, dropout)
	model = GPT(Decoder(DecoderLayer(d_model, c(attn), c(ff), dropout), N),
		## Sequential passes input to the forward() method in the first module it stores
		## and then "chains" outputs to inputs sequentially for subsequent modules,
		nn.Sequential(Embeddings(d_model, vocab), c(position)),
		Generator(d_model, vocab))
	
	# This was important from their code. 
	# Initialize parameters with Glorot / fan_avg.
	for p in model.parameters():
		if p.dim() > 1:
			nn.init.xavier_uniform_(p) # what does this do? How does it modify model?
	return model

Optimizer, loss function

In [48]:
class NoamOpt:
	#"Optim wrapper that implements rate."
	def __init__(self, model_size, factor, warmup, optimizer):
		self.optimizer = optimizer
		self._step = 0
		self.warmup = warmup
		self.factor = factor
		self.model_size = model_size
		self._rate = 0
		
	def step(self):
		# "Update parameters and rate"
		self._step += 1
		rate = self.rate()
		for p in self.optimizer.param_groups:
			p['lr'] = rate
		self._rate = rate
		self.optimizer.step()
		
	def rate(self, step = None):
		# "Implement `lrate` above"
		if step is None:
			step = self._step
		return self.factor * \
			(self.model_size ** (-0.5) *
			min(step ** (-0.5), step * self.warmup ** (-1.5)))


class LabelSmoothing(nn.Module):
	# "Implement label smoothing."
	def __init__(self, size, padding_idx, smoothing=0.0):
		super(LabelSmoothing, self).__init__()
		self.criterion = nn.KLDivLoss(size_average=False) # Kullback-Leibler divergence loss
		self.padding_idx = padding_idx
		self.confidence = 1.0 - smoothing
		self.smoothing = smoothing
		self.size = size
		self.true_dist = None
		
	def forward(self, x, target):
		assert x.size(1) == self.size
		true_dist = x.data.clone()
		true_dist.fill_(self.smoothing / (self.size - 2))
		true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
		true_dist[:, self.padding_idx] = 0
		mask = torch.nonzero(target.data == self.padding_idx, as_tuple=False)
		if mask.dim() > 0:
			true_dist.index_fill_(0, mask.squeeze(), 0.0)
		self.true_dist = true_dist.requires_grad_(False)
		# return self.criterion(x, Variable(true_dist, requires_grad=False))
		return self.criterion(x, true_dist)
  
  
class SimpleLossCompute:
	# "A simple loss compute and train function."
	def __init__(self, generator, criterion, opt=None):
		self.generator = generator
		self.criterion = criterion # LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
		self.opt = opt # NoamOpt(model.src_embed[0].d_model, 1, 400, torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
		
	def __call__(self, x, y, norm):
		x = self.generator(x) # x is output, each element now in d_vocab dimensions, shape = [30, 9, 11]
							  # y is batch.trg_y (first column of 1s removed), shape = [30, 9]
							  # norm is batch.ntokens (270)

		loss = self.criterion(x.contiguous().view(-1, x.size(-1)), # shape = [270, 11]
							  y.contiguous().view(-1)) / norm # shape = [270]
		loss.backward() # compute gradients (of what?)
		if self.opt is not None:
			self.opt.step()
			self.opt.optimizer.zero_grad()

		if list(loss.data.size()) != []:
			return loss.data[0] * norm
		else:
			return loss.data * norm


Make model

In [49]:
device = 'gpu'
V = voc.num_words
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, N=12).to(device)
## uses pytorch's Adam optimizer
model_opt = NoamOpt(model.embed[0].d_model, 1, 400,
		torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))



In [51]:
train_data=In_train

def run_epoch(model, loss_compute):
	"""Standard Training and Logging Function"""
	start = time.time()
	total_tokens = 0
	total_loss = 0
	tokens = 0
	train_iterator, valid_iterator, test_iterator = create_iterators()
	for batch, i in enumerate(range(0, len(train_data) - 1)):
		data, targets=next(iter(train_iterator)) # change both data and target
		mask = subsequent_mask(data.size(0)).to(device)
		out = model.forward(data.T, mask)
		loss = loss_compute(out, targets.T, V)
		total_loss += loss
		total_tokens += V
		tokens += V
		if i % 50 == 1:
			elapsed = time.time() - start
			print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
					(i, loss / V, tokens / elapsed))
			start = time.time()
			tokens = 0
	return total_loss / total_tokens

In [52]:

model.train() ## calls nn.Module.train() which sets mode to train
run_epoch(model, # generates 20 batches of [30, 10] random integers (first column is 1)
      SimpleLossCompute(model.generator, criterion, model_opt))
# model.eval() ## sets mode to testing (i.e. train=False). Layers like dropout behave differently depending on if mode is train or testing.
# run_epoch(model, 
#         SimpleLossCompute(model.generator, criterion, None))


Epoch Step: 1 Loss: 1.622465 Tokens per Sec: 368.875506
Epoch Step: 51 Loss: 0.201556 Tokens per Sec: 488.866957
Epoch Step: 101 Loss: 0.051840 Tokens per Sec: 489.427552
Epoch Step: 151 Loss: 0.054618 Tokens per Sec: 534.214603
Epoch Step: 201 Loss: 0.051482 Tokens per Sec: 534.717503
Epoch Step: 251 Loss: 0.050731 Tokens per Sec: 527.073128
Epoch Step: 301 Loss: 0.050033 Tokens per Sec: 536.063833
Epoch Step: 351 Loss: 0.050814 Tokens per Sec: 536.271535
Epoch Step: 401 Loss: 0.049945 Tokens per Sec: 531.607598
Epoch Step: 451 Loss: 0.052882 Tokens per Sec: 525.717861
Epoch Step: 501 Loss: 0.056537 Tokens per Sec: 514.674348
Epoch Step: 551 Loss: 0.051117 Tokens per Sec: 508.770060
Epoch Step: 601 Loss: 0.052809 Tokens per Sec: 488.726225
Epoch Step: 651 Loss: 0.055966 Tokens per Sec: 447.902482
Epoch Step: 701 Loss: 0.051520 Tokens per Sec: 410.974630


tensor(0.1246)

In [64]:
def greedy_decode(model, max_len, start_symbol):
	ys = torch.ones(1, 1).fill_(start_symbol).long()
	for i in range(max_len-1):
		out = model.forward(ys, subsequent_mask(ys.size(1)))
		prob = model.generator(out[:, -1])
		_, next_word = torch.max(prob, dim = 1)
		# print(next_word)
		next_word = next_word.data[0]
		# print(voc.to_word(next_word.item()))
		ys = torch.cat([ys, 
						torch.ones(1, 1).long().fill_(next_word)], dim=1)
	# print(ys)
	print([voc.to_word(index.item()) for index in ys[0]])
	return ys

In [65]:
greedy_decode(model, 20, 1)

['<BOS>', 'charge', 'occupancy', 'of', 'two', 'interacting', 'electrons', 'on', 'artificial', 'molecules', '-', 'exact', 'results', '<EOS>', 'results', '<EOS>', 'ion', 'interaction', 'and', 'in']


tensor([[   1,  429, 1138,    4,   44,  486,   19,    8, 1139,  630,    6,   57,
          558,    2,  558,    2,  153,   10,   28,   33]])

In [66]:
greedy_decode(model, 20, 6)

['-', 'singular', 'spectral', 'features', 'for', 'the', 'interacting', 'vortex', 'structures', 'in', 'thermostat', 'regime', '<EOS>', 'dimensional', 'electron', 'gas', '<EOS>', '<EOS>', 'ion', 'interaction']


tensor([[   6, 1195,  112, 1196,   98,   34,  486,  260,  142,   33, 1407,  300,
            2,   45,    5,  144,    2,    2,  153,   10]])