In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import json
import pickle
import dill


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

## kaggle 
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
        
    

import os, psutil  

def cpu_stats():
    """
    Describe how much memory the current Python process is using.

    Returns:
        str: 'memory GB:' followed by the usage rounded to two decimals.
    """
    process = psutil.Process(os.getpid())
    # first field of memory_info() (the resident set size per the psutil docs), in bytes
    used_bytes = process.memory_info()[0]
    used_gib = used_bytes / 2. ** 30
    rounded = np.round(used_gib, 2)
    return 'memory GB:' + str(rounded)

In [2]:
# !pip install pytorch_lightning

## **Guidelines to follow before working on this**
- Comment your code well
- See **to do** and update after doing something (at the end)
- Write readable code
- Save stats after performing the experiment (training)
- Hyperparameters must stay the same throughout the whole training/experiment.

## Done
Dataset is ready to feed to pytorch models.

## **To Do**
- Create Recurrent Seq2Seq model(baseline) 
- Training/testing with metrics such as perplexity (PPL) and BLEU score. Other metrics may be added, but these two are required. Also add loggers and checkpoints: TensorBoard, EarlyStopping and ModelCheckpoint are mandatory; the rest are optional.
- Start from **Run from here after import**
- Update the below **Results** table



## **Result**

|   Model Name  |    Train_Loss |      Train_PPL |    Train_BLEU |       Val_Loss |       Val_PPL | Val_BLEU     |
| ------------- | ------------- |  ------------- | ------------- |  ------------- | ------------- |------------- |
| Content Cell  | Content Cell  |  Content Cell  | Content Cell  |  Content Cell  | Content Cell  |Content Cell  |
| Content Cell  | Content Cell  |  Content Cell  | Content Cell  |  Content Cell  | Content Cell  |Content Cell  |


In [3]:
import torch
from torchtext.data import Field, TabularDataset, BucketIterator, utils, Dataset

In [4]:
# Reproducibility: seed every random number generator the notebook relies on
# (Python's `random`, NumPy, and PyTorch on CPU and GPU).
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible GPU, not only the current one (silently ignored without CUDA).
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels ...
torch.backends.cudnn.deterministic = True
# ... and turn off its autotuner, which may otherwise select different
# algorithms from one run to the next.
torch.backends.cudnn.benchmark = False

In [5]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cuda', index=0)

In [7]:
cpu_stats()

'memory GB:0.22'

> ## Data Preparation

In [8]:
# Location of the arXiv metadata snapshot that get_metadata() reads line by line.
path = "./dataset/arxiv-metadata-oai-snapshot.json"

In [9]:
def get_metadata(file_path=None):
    """
    Lazily yield the raw lines of the metadata snapshot.

    Each line is expected to hold one JSON document (callers decode it with
    `json.loads`), so the file is streamed instead of being read in full.

    Args:
        file_path: file to read. Defaults to the notebook-level `path`.

    Yields:
        str: one undecoded line of the file, including its trailing newline.
    """
    if file_path is None:
        file_path = path
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        yield from f

In [10]:
# Peek at the first record only, to see which fields a paper entry carries.
metadata = get_metadata()
paper = next(metadata, None)
if paper is not None:
    paper_dict = json.loads(paper)
    print(f'Title: {paper_dict.get("title")}\n\n Abstract: {paper_dict.get("abstract")}\n\n Ref: {paper_dict.get("journal-ref")}\n\n Cat: {paper_dict.get("categories")[0]}')

Title: Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies

 Abstract:   A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
th

#### Paper abstracts, titles and other metadata

In [18]:
titles = []
abstracts = []
years = []
cats = []

# Keep only the papers whose journal reference ends in a publication year
# strictly between 2000 and 2021; the last four characters of the reference
# are taken as the year.
metadata = get_metadata()
for paper in metadata:
    paper_dict = json.loads(paper)
    ref = paper_dict.get('journal-ref')
    if not ref:
        # no journal reference, hence no year to extract
        continue
    try:
        year = int(ref[-4:])
    except (TypeError, ValueError):
        # the reference does not end in a parsable year; skip this paper
        # (narrow except: real errors and Ctrl-C are no longer swallowed)
        continue
    if 2000 < year < 2021:
        years.append(year)
        titles.append(paper_dict.get('title'))
        abstracts.append(paper_dict.get('abstract'))
        cats.append(paper_dict.get("categories"))

print(len(titles), len(abstracts), len(years), len(cats))

274255 274255 274255 274255


In [19]:
# Assemble the filtered records into one frame; the keyword order fixes the
# column order (title, abstract, year, categories).
papers = pd.DataFrame(
    dict(title=titles, abstract=abstracts, year=years, categories=cats)
)

In [20]:
# head
papers.head()

Unnamed: 0,title,abstract,year,categories
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,2007,[hep-ph]
1,Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation...,2007,[gr-qc]
2,"The Spitzer c2d Survey of Large, Nearby, Inste...",We discuss the results from the combined IRA...,2007,[astro-ph]
3,Fermionic superstring loop amplitudes in the p...,The pure spinor formulation of the ten-dimen...,2007,[hep-th]
4,Lifetime of doubly charmed baryons,"In this work, we evaluate the lifetimes of t...",2008,[hep-ph]


In [21]:
### save the dataframe
papers.to_csv("papers.csv", index=False)

In [22]:
del papers

### Run from here after import
- [Download Preprocessed Data](https://www.dropbox.com/s/ta5z9ec3rc8bju8/preprocessed.zip?dl=0)

In [8]:
# Title and abstract are both plain text, so a single Field (and therefore a
# single vocabulary) is shared by the two columns.
field = Field(
    tokenize="spacy",
    tokenizer_language="en",
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
    batch_first=True,
)
fields = [(column, field) for column in ("title", "abstract")]

In [9]:
## Skip this cell if you are loading the pre-processed data; it builds the dataset from papers.csv
# dataset = TabularDataset(path="papers.csv", format="CSV", fields=fields)

#### Save the data so that we do not need to preprocess it again next time (that's a hack)

In [10]:
# # save the data so that next time we don't need preprocessing
# with open("field.pkl", "wb") as fp:
#     dill.dump(field, fp)
    
# with open("data.pkl", "wb") as fp:
#     dill.dump(dataset.examples, fp)

In [9]:
# Load the pre-processed examples that were previously dumped to data.pkl.
# NOTE(review): dill/pickle can execute arbitrary code while deserializing —
# only load a data.pkl that comes from a source you trust.
with open("data.pkl", 'rb') as fp:
    examples = dill.load(fp)

In [10]:
# create dataset
dataset  = Dataset(examples=examples, fields=fields)

In [11]:
# train-val split
train, val = dataset.split(split_ratio=0.95)

In [12]:
# build vocabulary
field.build_vocab(train)

In [13]:
# vocab size
len(field.vocab)

264898

In [14]:
# Batch iterators for the training and validation splits.
# No `device` is passed here: the training loop moves each batch itself.
BATCH_SIZE = 64
train_loader, val_loader = BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    sort=False,
    shuffle=True,
)

In [16]:
batch = next(iter(train_loader))

In [17]:
print(batch.title.shape, batch.abstract.shape)

torch.Size([64, 28]) torch.Size([64, 372])


## Model
- See to do at the top

### 1. Seq2Seq Model

In [19]:
import torch.nn as nn
from tqdm import tqdm
# import pytorch_lightning as pl

In [20]:
vocab_size = len(field.vocab)
embedding_dim = 256
hidden_size = 512

In [21]:
class SequentialModel(nn.Module):
    """
        Embedding layer followed by a single-layer GRU.

        The same building block is used as the encoder (to summarize the
        source text into its last hidden state) and as the decoder (a
        language model that starts from a given hidden state).

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each token embedding.
            hidden_size: size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(SequentialModel, self).__init__()

        # plain int (a trailing comma here would store a 1-tuple instead)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size

        # embedding layer to embed the tokens
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # recurrent layer whose last hidden state is the context of the whole sequence
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )

    def forward(self, x, hidden=None):
        """
            Run the token ids through the embedding and the GRU.

            Args:
                x: token ids, shape [batch, seq_len].
                hidden: optional initial hidden state, shape [1, batch, hidden_size];
                    the GRU starts from zeros when it is None.

            Returns:
                outputs: per-step GRU outputs, shape [batch, seq_len, hidden_size].
                hidden: final hidden state, shape [1, batch, hidden_size].
        """
        embedded = self.embedding(x)
        outputs, hidden = self.gru(embedded, hidden)
        return outputs, hidden

In [22]:
class Seq2Seq(nn.Module):
    """
        Recurrent encoder-decoder: the encoder's final hidden state initializes
        the decoder, whose outputs are projected onto the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, dropout=0.15):
        super().__init__()

        # encoder and decoder share the same architecture (but not weights)
        block_kwargs = dict(
            vocab_size=vocab_size,
            embedding_dim=embedding_dim,
            hidden_size=hidden_size,
        )
        self.encoder = SequentialModel(**block_kwargs)  # reads the source sequence
        self.decoder = SequentialModel(**block_kwargs)  # language model over the target

        # dropout on the decoder outputs, then projection to vocabulary logits
        self.dropout = nn.Dropout(p=dropout)
        self.fc_out = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, src, trg):
        """
            src.shape -> [batch, src_len]
            trg.shape -> [batch, trg_len]

            returns logits produced by `fc_out` for every target position
        """
        context = self.encoder(src)[1]
        decoded = self.decoder(trg, context)[0]
        return self.fc_out(self.dropout(decoded))

## Training

In [58]:
model =  Seq2Seq(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_size=hidden_size).to(device)

In [59]:
## optimizer and criterion
PAD_IDX = field.vocab.stoi[field.pad_token]
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [60]:
def eval(model, data_loader, crtierion):
    """
    Compute the average loss and perplexity of `model` over `data_loader`.

    NOTE(review): the name shadows Python's built-in `eval`; it is kept because
    other cells call it under this name.

    Args:
        model: sequence-to-sequence model called as `model(src, trg)`.
        data_loader: iterable of batches exposing `.abstract` and `.title`
            tensors of shape [batch, seq_len].
        crtierion: loss function taking (logits [N, vocab], targets [N]).
            The misspelled parameter name is kept for backward compatibility.

    Returns:
        (mean batch loss, mean of the per-batch perplexities exp(loss)).
    """
    # use the loss that was passed in rather than a notebook-level global
    criterion = crtierion

    # evaluation mode switches dropout off; the previous mode is restored afterwards
    was_training = model.training
    model.eval()

    losses = []
    ppl = []
    try:
        with torch.no_grad():
            for batch in tqdm(data_loader):
                # logits and targets must live on the same device
                abstract = batch.abstract.to(device)
                title = batch.title.to(device)

                # next-token prediction: the decoder reads tokens [0, T-1) and is
                # scored against tokens [1, T); otherwise it only has to copy its input
                decoder_input, target = title[:, :-1], title[:, 1:]
                outputs = model(abstract, decoder_input)

                # reshape (not view): the shifted target slice is not contiguous
                l = criterion(outputs.reshape(-1, outputs.shape[-1]), target.reshape(-1))
                losses.append(l.item())
                ppl.append(torch.exp(l).item())
    finally:
        model.train(was_training)

    return sum(losses)/len(losses), sum(ppl)/len(ppl)
    

In [61]:
def train(model, data_loader, criterion, optimizer, EPOCHS=10, val_data_loader=None):
    """
    Train `model` with teacher forcing and report loss/perplexity.

    Progress is printed every 1000 steps, and train/validation averages are
    printed at the end of every epoch.

    Args:
        model: sequence-to-sequence model called as `model(src, trg)`.
        data_loader: iterable (with `len`) of training batches exposing
            `.abstract` and `.title` tensors of shape [batch, seq_len].
        criterion: loss function taking (logits [N, vocab], targets [N]).
        optimizer: optimizer over the model parameters.
        EPOCHS: number of passes over `data_loader`.
        val_data_loader: validation batches; defaults to the notebook-level
            `val_loader` when omitted.

    Returns:
        None
    """
    if val_data_loader is None:
        val_data_loader = val_loader

    total_steps = len(data_loader)*EPOCHS
    steps = 0
    for epoch in tqdm(range(EPOCHS), desc="Epoch"):
        # make sure dropout is active even if an evaluation pass left eval mode on
        model.train()
        losses = []
        ppl = []
        # a single progress bar per epoch, sized from the loader actually iterated
        for batch in tqdm(data_loader, desc="Step", leave=False):
            abstract, title = batch.abstract.to(device), batch.title.to(device)

            # next-token prediction: feed tokens [0, T-1), predict tokens [1, T);
            # scoring the decoder on its own input would only teach it to copy
            decoder_input, target = title[:, :-1], title[:, 1:]
            outputs = model(abstract, decoder_input)

            # reshape (not view): the shifted target slice is not contiguous
            loss = criterion(outputs.reshape(-1, outputs.shape[-1]), target.reshape(-1))

            # backpropagate the loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_value = loss.item()
            ppl_value = torch.exp(loss.detach()).item()
            losses.append(loss_value)
            ppl.append(ppl_value)

            if steps%1000==0:
                print(f'Steps {steps}/{total_steps} | Train_loss {loss_value:.3f} | Train_ppl {ppl_value:.3f}')

            steps += 1

        avg_loss = sum(losses)/len(losses)
        avg_ppl = sum(ppl)/len(ppl)

        val_loss, val_ppl = eval(model, val_data_loader, criterion)

        # epochs are reported 1-based so the last one reads EPOCHS/EPOCHS
        print(f'Epoch {epoch + 1}/{EPOCHS} | Steps {steps}/{total_steps}\nTrain_loss {avg_loss:.3f} | Train_ppl {avg_ppl:.3f}\nVal_loss {val_loss:.3f} Val_ppl {val_ppl:.3f}')
                    

In [63]:
train(model, train_loader, criterion, optimizer)