In [2]:
import urllib
from urllib import request
from bs4 import BeautifulSoup
import os
import requests
import argparse
import re
import time
import json
import math

import codecs
import urllib.parse as up

# Crawler and Semantic Scholar Information Extractor

In [3]:
# Generate a concatenated tldr string of papers from a certain query. Also return the information pack of these papers.
def SSSQuery(query, num_item=50, offset=0, fos=None):
    # Send a Semantic API post to get the result paper list
    query = '+'.join(query.split())
    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={query}&offset={offset}&limit={num_item}&fields=fieldsOfStudy,abstract'
    paper_list = str(request.urlopen(url).read(), 'utf-8')
    paper_list = json.loads(paper_list)

    tldr_cat = ''
    info_pack = []
    # Loop through the paper list and get information pack for each paper
    for paper in paper_list['data']:
        paper_id = paper['paperId']
        # Each paper info pack consists title, abstract, authors, tldr, citation count, field of study
        paper_url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=title,abstract,citations.authors,tldr,citationCount,fieldsOfStudy'
        paper_info = str(request.urlopen(paper_url).read(), 'utf-8')
        paper_info = json.loads(paper_info)
        # Filter out the papers out of field of study
        if fos is not None and fos not in paper_info['fieldsOfStudy']:
            continue
        info_pack.append(paper_info)
        tldr = paper_info['tldr']
        if tldr is not None:
            tldr_cat += ' '+tldr['text'].strip()
    tldr_cat = tldr_cat.strip()
    return tldr_cat, info_pack

In [4]:
tldr_cat, info_pack = SSSQuery('transformer', num_item=10, fos='Computer Science')
print([i['title'] for i in info_pack])

['Spatial Transformer Networks', 'Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer', 'Transformer-XL: Attentive Language Models beyond a Fixed-Length Context', 'Longformer: The Long-Document Transformer', 'Reformer: The Efficient Transformer', 'Conformer: Convolution-augmented Transformer for Speech Recognition', 'Heterogeneous Graph Transformer', 'AraBERT: Transformer-based Model for Arabic Language Understanding', 'Reducing Transformer Depth on Demand with Structured Dropout', 'Meshed-Memory Transformer for Image Captioning']


In [5]:
tldr_cat

'This work introduces a new learnable module, the Spatial Transformer, which explicitly allows the spatial manipulation of data within the network, and can be inserted into existing convolutional architectures, giving neural networks the ability to actively spatially transform feature maps. This systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks and achieves state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. This work proposes a novel neural architecture Transformer-XL that enables learning dependency beyond a fixed length without disrupting temporal coherence, which consists of a segment-level recurrence mechanism and a novel positional encoding scheme. Following prior work on long-sequence transformers, the Longformer is evaluated on character-level language modeling and achieves state-of-the-art result

# Keywords Extraction

In [6]:
language = "en"
max_ngram_size = 1
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 20

custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_thresold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
keywords = custom_kw_extractor.extract_keywords(tldr_cat)
keywords = sorted(keywords, key=lambda x:x[1])
for kw in keywords:
    print(kw)

('Spatial', 0.03213113441634914)
('work', 0.04798022059872212)
('language', 0.06973861700167353)
('Transformer', 0.07994217878795795)
('tasks', 0.08425923835568348)
('architectures', 0.12034394262796394)
('Longformer', 0.1231255301810972)
('neural', 0.13795581464104506)
('training', 0.13840506098154778)
('module', 0.14654904914113015)
('giving', 0.14654904914113015)
('maps', 0.14654904914113015)
('achieves', 0.1514314025909243)
('results', 0.1514314025909243)
('introduces', 0.15751851687783422)
('learnable', 0.15751851687783422)
('explicitly', 0.15751851687783422)
('manipulation', 0.15751851687783422)
('data', 0.15751851687783422)
('inserted', 0.15751851687783422)


In [8]:
!pip install transformers==3.4.0

Collecting transformers==3.4.0
  Downloading transformers-3.4.0-py3-none-any.whl (1.3 MB)
Collecting tokenizers==0.9.2
  Downloading tokenizers-0.9.2-cp36-cp36m-win_amd64.whl (1.9 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.3
    Uninstalling tokenizers-0.10.3:
      Successfully uninstalled tokenizers-0.10.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.12.5
    Uninstalling transformers-4.12.5:
      Successfully uninstalled transformers-4.12.5
Successfully installed tokenizers-0.9.2 transformers-3.4.0


ERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you use --use-feature=2020-resolver to test your packages with the new resolver before it becomes the default.

sentence-transformers 2.1.0 requires tokenizers>=0.10.3, but you'll have tokenizers 0.9.2 which is incompatible.
sentence-transformers 2.1.0 requires transformers<5.0.0,>=4.6.0, but you'll have transformers 3.4.0 which is incompatible.
flair 0.7 requires sentencepiece<=0.1.91, but you'll have sentencepiece 0.1.96 which is incompatible.
flair 0.7 requires transformers<=3.5.1,>=3.5.0, but you'll have transformers 3.4.0 which is incompatible.


In [9]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
Collecting hstspreload
  Downloading hstspreload-2021.11.1-py3-none-any.whl (1.3 MB)
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
Collecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
Collecting contextvars>=2.1; python_version < "3.7"
  Downloading contextvars-2.4.tar.gz (9.6 kB)
Collecting hpack<4,>=3.0
  Downloading hpack-3.0.0-py2.py3-none-any.whl (38 kB)
Collecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2.0-py2.py3-none-any.whl (12 kB)
Collecting immutables>=0.9
  Downloading immutables-0.16-cp36-cp36m-win_amd64.whl (59 kB)
Building w

In [15]:
pip install celery==4.4.2


Collecting celery==4.4.2
  Downloading celery-4.4.2-py2.py3-none-any.whl (422 kB)
Collecting billiard<4.0,>=3.6.3.0
  Downloading billiard-3.6.4.0-py3-none-any.whl (89 kB)
Collecting pytz>dev
  Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)
Collecting vine==1.3.0
  Downloading vine-1.3.0-py2.py3-none-any.whl (14 kB)
Collecting kombu<4.7,>=4.6.8
  Downloading kombu-4.6.11-py2.py3-none-any.whl (184 kB)
Collecting amqp<2.7,>=2.6.0
  Downloading amqp-2.6.1-py2.py3-none-any.whl (48 kB)
Installing collected packages: billiard, pytz, vine, amqp, kombu, celery
Successfully installed amqp-2.6.1 billiard-3.6.4.0 celery-4.4.2 kombu-4.6.11 pytz-2021.3 vine-1.3.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
Note: you may need to restart the kernel to use updated packages.
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16175 sha256=fdc9b5b4982ca8a2e5f11506fd1729be7df2837efc779ee1cbc9a4b236a857dc
  Stored in directory: c:\users\minha\appdata\local\pip\cache\wheels\39\29\36\1c4f7905c133e11748ca375960154964082d4fb03478323089
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [10]:
pip install conllu

Collecting conllu
  Downloading conllu-4.4.1-py2.py3-none-any.whl (15 kB)
Installing collected packages: conllu
Successfully installed conllu-4.4.1
Note: you may need to restart the kernel to use updated packages.


In [44]:
from helpers import tokenize_and_format, flat_accuracy
import torch
import pandas as pd

df = pd.read_csv('transformer_citation.csv')
# df = pd.read_csv('tweets.csv')

df = df.sample(frac=1).reset_index(drop=True)

texts = df.text.values
labels = df.citationCount.values

labels =labels/labels.max()

# print(labels)
### tokenize_and_format() is a helper function provided in helpers.py ###
input_ids, attention_masks = tokenize_and_format(texts)

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.FloatTensor(labels)
# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

[0.03500243 0.04180846 0.01458435 0.0405931  0.01871658 0.01871658
 0.01895965 0.07778318 0.02479339 0.02236266 0.0405931  0.02600875
 0.01944579 0.03475936 0.01093826 0.02236266 0.0449684  0.05833738
 0.0140982  0.01652893 0.55104521 0.03014098 0.03014098 0.03597472
 0.08653379 0.07267866 0.01944579 0.02309188 0.04302382 0.04083617
 0.04423918 0.01263977 0.02698104 0.03719008 0.033544   1.
 0.00947982 0.02625182 0.03427321 0.09577054 0.01336898 0.02479339
 0.02236266 0.01628585 0.0308702  0.06976179 0.06854643 0.04545455
 0.01507049 0.03257171 0.02139037 0.0281964  0.11983471 0.04278075
 0.03913466 0.01482742 0.05614973 0.05323286 0.34540593 0.01507049
 0.03159942 0.06222654 0.10841031 0.0449684  0.01385513 0.02260574
 0.01507049 0.12955761 0.01069519]
Original:  Levenshtein Transformer is developed, a new partially autoregressive model devised for more flexible and amenable sequence generation and a set of new training techniques dedicated at them, effectively exploiting one as the o

In [45]:

total = len(df)

num_train = int(total * .6)
num_val = int(total * .2)
num_test = total - num_train - num_val

# make lists of 3-tuples (already shuffled the dataframe in cell above)

train_set = [(input_ids[i], attention_masks[i], labels[i]) for i in range(num_train)]
val_set = [(input_ids[i], attention_masks[i], labels[i]) for i in range(num_train, num_val+num_train)]
test_set = [(input_ids[i], attention_masks[i], labels[i]) for i in range(num_val + num_train, total)]

train_text = [texts[i] for i in range(num_train)]
val_text = [texts[i] for i in range(num_train, num_val+num_train)]
test_text = [texts[i] for i in range(num_val + num_train, total)]

In [46]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 1, # The number of output labels.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
device = torch.device("cpu")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [47]:
batch_size = 16
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8
                  weight_decay = 0.01
                )
epochs = 10





In [57]:
import numpy as np
# function to get validation accuracy
def get_validation_performance(val_set):
    # Put the model in evaluation mode
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0

    num_batches = int(len(val_set)/batch_size) + 1

    for i in range(num_batches):
      end_index = min(batch_size * (i+1), len(val_set))

      batch = val_set[i*batch_size:end_index]

      input_id_tensors = torch.stack([data[0] for data in batch])
      input_mask_tensors = torch.stack([data[1] for data in batch])
      label_tensors = torch.stack([data[2] for data in batch])
      
      # Move tensors to the GPU
      b_input_ids = input_id_tensors.to(device)
      b_input_mask = input_mask_tensors.to(device)
      b_labels = label_tensors.to(device)
        
      # Tell pytorch not to bother with constructing the compute graph during
      # the forward pass, since this is only needed for backprop (training).
      with torch.no_grad():        

        # Forward pass, calculate logit predictions.
        outputs = model(b_input_ids, 
                                token_type_ids=None, 
                                attention_mask=b_input_mask,
                                labels=b_labels)
        loss = outputs[0]#.loss
        logits = outputs[1]#.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().numpy()
        label_ids = b_labels.numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches. We provide a function in 
        # helpers.py called flat_accuracy() that takes in predictions and labels
        # acc = flat_accuracy(logits, label_ids)       
        # total_eval_accuracy += acc
        error = np.abs(logits-label_ids).sum()
    # Report the final accuracy for this validation run.
    avg_error = error / num_batches
    return avg_error 



In [58]:
import random

# training loop

# For each epoch...
for epoch_i in range(0, epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    # num_batches = int(len(train_set)/batch_size) + 1
    num_batches = int(len(train_set)/batch_size)

    for i in range(num_batches):
      end_index = min(batch_size * (i+1), len(train_set))

      batch = train_set[i*batch_size:end_index]
      input_id_tensors = torch.stack([data[0] for data in batch])
      input_mask_tensors = torch.stack([data[1] for data in batch])
      label_tensors = torch.stack([data[2] for data in batch])

      # Move tensors to the GPU
      b_input_ids = input_id_tensors.to(device)
      b_input_mask = input_mask_tensors.to(device)
      b_labels = label_tensors.to(device)

      # Clear the previously calculated gradient
      model.zero_grad()        

      # Perform a forward pass (evaluate the model on this training batch).
      outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels)
      # print(outputs)
      loss = outputs[0]#.loss
      logits = outputs[1]#.logits

      total_train_loss += loss.item()

      # Perform a backward pass to calculate the gradients.
      loss.backward()

      # Update parameters and take a step using the computed gradient.
      optimizer.step()
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set. Implement this function in the cell above.
    print(f"Total loss: {total_train_loss}")
    val_error = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_error}")
    
print("")
print("Training complete!")



Training...
Total loss: 0.019743618555366993
Validation accuracy: 5.407197952270508

Training...
Total loss: 0.00995240593329072
Validation accuracy: 9.79728889465332

Training...
Total loss: 0.01938678789883852
Validation accuracy: 13.865350723266602

Training...
Total loss: 0.01795857585966587
Validation accuracy: 9.393569946289062

Training...
Total loss: 0.01085454411804676
Validation accuracy: 5.033558368682861

Training...
Total loss: 0.01806918578222394
Validation accuracy: 5.784292221069336

Training...
Total loss: 0.018869482446461916
Validation accuracy: 4.773448944091797

Training...
Total loss: 0.01139177568256855
Validation accuracy: 5.468070030212402

Training...
Total loss: 0.01669137179851532
Validation accuracy: 6.432164192199707

Training...
Total loss: 0.008767289575189352
Validation accuracy: 6.560471057891846

Training complete!


In [18]:
from keybert import KeyBERT

# doc = """
#          Supervised learning is the machine learning task of learning a function that
#          maps an input to an output based on example input-output pairs. It infers a
#          function from labeled training data consisting of a set of training examples.
#          In supervised learning, each example is a pair consisting of an input object
#          (typically a vector) and a desired output value (also called the supervisory signal). 
#          A supervised learning algorithm analyzes the training data and produces an inferred function, 
#          which can be used for mapping new examples. An optimal scenario will allow for the 
#          algorithm to correctly determine the class labels for unseen instances. This requires 
#          the learning algorithm to generalize from the training data to unseen situations in a 
#          'reasonable' way (see inductive bias).
#       """
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(tldr_cat, keyphrase_ngram_range=(1,3), stop_words='english', top_n = 10, use_maxsum=True, nr_candidates=40, use_mmr=True, diversity=0.7)
print(keywords)

[('neural architecture transformer', 0.5527), ('length disrupting', 0.1076), ('benchmarks covering summarization', 0.2611), ('language showed newly', 0.2474), ('explicitly allows spatial', 0.2433), ('replaces dot product', 0.2065), ('answering', 0.2506), ('finetune variety downstream', 0.0994), ('sampling algorithm hgsampling', -0.0439), ('nlp tasks layerdrop', 0.323)]


# Development Scratch

In [21]:
num_item = 30
offset = 0
query = 'hand'
query = '+'.join(query.split())
url = f'https://api.semanticscholar.org/graph/v1/paper/search?fos[0]=computer-science&query={query}&offset={offset}&limit={num_item}'
paper_list = str(request.urlopen(url).read(), 'utf-8')
paper_list = json.loads(paper_list)
# print(paper_list)

In [42]:
for paper in paper_list['data']:
    paper_id = paper['paperId']
    paper_url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=citations.authors,tldr,citationCount,fieldsOfStudy'
    paper_info = str(request.urlopen(paper_url).read(), 'utf-8')
    paper_info = json.loads(paper_info)
    print(paper_info['citationCount'], paper_info['fieldsOfStudy'])