In [3]:
import urllib
from urllib import request
from bs4 import BeautifulSoup
import os
import requests
import argparse
import re
import time
import json
import math
import yake
import pandas as pd

import codecs
import urllib.parse as up

# Crawler and Semantic Scholar Information Extractor

In [30]:
# Generate a concatenated tldr string of papers from a certain query. Also return the information pack of these papers.
def SSSQuery(query, num_item=50, offset=0, fos=None):
    """Search Semantic Scholar and gather the TLDR summaries of the hits.

    Sends one request to the Graph API paper-search endpoint and then one
    detail request per hit that survives the field-of-study filter. The
    search URL is printed so the exact request is visible in the cell output.

    Args:
        query: Free-text search string. Whitespace is collapsed and the text
            is percent-encoded before it is placed in the URL.
        num_item: Value sent as the search ``limit`` parameter.
        offset: Value sent as the search ``offset`` parameter.
        fos: Optional field-of-study name (e.g. ``'Computer Science'``). When
            given, papers whose ``fieldsOfStudy`` does not contain it (or is
            missing/null) are skipped.

    Returns:
        A ``(tldr_cat, info_pack)`` tuple: ``tldr_cat`` is the stripped TLDR
        texts of the kept papers joined by single spaces, and ``info_pack`` is
        the list of detail records (decoded JSON dicts) for those papers, in
        search-result order. Papers without a TLDR are not included.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a request
            fails (``HTTPError`` for non-success status codes).
    """
    # Percent-encode the query so characters such as '&', '#', '+' or quotes
    # cannot corrupt the query string; words still end up joined by '+'.
    query = up.quote_plus(' '.join(query.split()))
    url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={query}&offset={offset}&limit={num_item}&fields=fieldsOfStudy,abstract'
    print(url)
    with request.urlopen(url) as response:
        paper_list = json.loads(response.read().decode('utf-8'))

    tldr_texts = []
    info_pack = []
    # Loop through the search hits and fetch the information pack of each one.
    for paper in paper_list.get('data') or []:
        # The search response already carries fieldsOfStudy, so out-of-field
        # hits can be rejected without spending a detail request on them.
        if (fos is not None and 'fieldsOfStudy' in paper
                and fos not in (paper['fieldsOfStudy'] or [])):
            continue
        paper_id = paper['paperId']
        # Each info pack holds title, abstract, citing authors, tldr,
        # citation counts and fields of study.
        paper_url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=title,abstract,citations.authors,tldr,citationCount,influentialCitationCount,fieldsOfStudy'
        with request.urlopen(paper_url) as response:
            paper_info = json.loads(response.read().decode('utf-8'))
        # fieldsOfStudy may be null; treat that as "no fields" instead of
        # letting the membership test raise TypeError.
        if fos is not None and fos not in (paper_info.get('fieldsOfStudy') or []):
            continue
        tldr = paper_info.get('tldr')
        if tldr is not None:
            tldr_texts.append(tldr['text'].strip())
            info_pack.append(paper_info)
    tldr_cat = ' '.join(tldr_texts).strip()
    return tldr_cat, info_pack

In [31]:
# Run the search for 'transformer', keeping only Computer Science papers.
tldr_cat, info_pack = SSSQuery('transformer', num_item=100, fos='Computer Science')
# Show total and influential citation counts, one list per line, in result order.
print(
    [paper_info['citationCount'] for paper_info in info_pack],
    [paper_info['influentialCitationCount'] for paper_info in info_pack],
    sep='\n',
)

https://api.semanticscholar.org/graph/v1/paper/search?query=transformer&offset=0&limit=100&fields=fieldsOfStudy,abstract
[4114, 2267, 1421, 533, 493, 320, 148, 185, 187, 138, 124, 127, 116, 356, 240, 172, 102, 80, 68, 231, 185, 167, 161, 394, 167, 282, 299, 176, 144, 52, 219, 124, 143, 182, 102, 107, 446, 92, 287, 134, 77, 67, 61, 58, 57, 55, 62, 44, 78, 80, 108, 130, 92, 62, 168, 153, 95, 92, 45, 88, 39, 77, 62, 60, 141, 111, 93, 256, 177]
[527, 416, 183, 131, 72, 63, 39, 50, 31, 27, 9, 16, 9, 70, 42, 43, 6, 24, 9, 30, 29, 31, 29, 124, 27, 25, 20, 7, 29, 2, 42, 33, 21, 53, 7, 14, 26, 7, 26, 43, 11, 3, 8, 11, 3, 3, 1, 7, 25, 14, 15, 10, 15, 6, 10, 8, 8, 8, 8, 13, 5, 16, 1, 1, 7, 18, 5, 16, 4]


In [10]:
# Spot-check: TLDR text of the first record in info_pack.
info_pack[0]['tldr']['text']

'This work introduces a new learnable module, the Spatial Transformer, which explicitly allows the spatial manipulation of data within the network, and can be inserted into existing convolutional architectures, giving neural networks the ability to actively spatially transform feature maps.'

In [34]:
# Tabulate each kept paper's TLDR text next to its two citation metrics.
df = pd.DataFrame({
    'text': [paper_info['tldr']['text'] for paper_info in info_pack],
    'citationCount': [paper_info['citationCount'] for paper_info in info_pack],
    'influentialCitationCount': [paper_info['influentialCitationCount'] for paper_info in info_pack],
})
# Write the table without the row index (index=False is the documented
# boolean form of what index=None achieved by being falsy).
df.to_csv('transformer_citation.csv', index=False)
# The preview must be the cell's last expression to be rendered; placed
# mid-cell, its result was silently discarded.
df.head()

In [5]:
# Display the concatenated TLDR string returned by SSSQuery.
tldr_cat

'This work introduces a new learnable module, the Spatial Transformer, which explicitly allows the spatial manipulation of data within the network, and can be inserted into existing convolutional architectures, giving neural networks the ability to actively spatially transform feature maps. This systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks and achieves state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. This work proposes a novel neural architecture Transformer-XL that enables learning dependency beyond a fixed length without disrupting temporal coherence, which consists of a segment-level recurrence mechanism and a novel positional encoding scheme. Following prior work on long-sequence transformers, the Longformer is evaluated on character-level language modeling and achieves state-of-the-art result

# Keyword Extraction

In [6]:
# YAKE settings, kept as module-level names so they can be tuned in one place.
language = "en"                  # passed as `lan`
max_ngram_size = 1               # passed as `n`
deduplication_thresold = 0.9     # passed as `dedupLim`
deduplication_algo = 'seqm'      # passed as `dedupFunc`
windowSize = 1                   # passed as `windowsSize`
numOfKeywords = 20               # passed as `top`

custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_thresold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)
# Extract (keyword, score) pairs from the concatenated TLDRs and order them by
# ascending score.
# NOTE(review): presumably a lower YAKE score means a more relevant keyword —
# confirm against the YAKE documentation.
keywords = sorted(
    custom_kw_extractor.extract_keywords(tldr_cat),
    key=lambda scored: scored[1],
)
for kw in keywords:
    print(kw)

('Spatial', 0.03213113441634914)
('work', 0.04798022059872212)
('language', 0.06973861700167353)
('Transformer', 0.07994217878795795)
('tasks', 0.08425923835568348)
('architectures', 0.12034394262796394)
('Longformer', 0.1231255301810972)
('neural', 0.13795581464104506)
('training', 0.13840506098154778)
('module', 0.14654904914113015)
('giving', 0.14654904914113015)
('maps', 0.14654904914113015)
('achieves', 0.1514314025909243)
('results', 0.1514314025909243)
('introduces', 0.15751851687783422)
('learnable', 0.15751851687783422)
('explicitly', 0.15751851687783422)
('manipulation', 0.15751851687783422)
('data', 0.15751851687783422)
('inserted', 0.15751851687783422)


In [19]:
from keybert import KeyBERT

# Commented-out sample document; it is not used by the code below.
# doc = """
#          Supervised learning is the machine learning task of learning a function that
#          maps an input to an output based on example input-output pairs. It infers a
#          function from labeled training data consisting of a set of training examples.
#          In supervised learning, each example is a pair consisting of an input object
#          (typically a vector) and a desired output value (also called the supervisory signal).
#          A supervised learning algorithm analyzes the training data and produces an inferred function,
#          which can be used for mapping new examples. An optimal scenario will allow for the
#          algorithm to correctly determine the class labels for unseen instances. This requires
#          the learning algorithm to generalize from the training data to unseen situations in a
#          'reasonable' way (see inductive bias).
#       """

# Extract up to ten 1-3 word keyphrases from the concatenated TLDRs with KeyBERT.
# NOTE(review): nr_candidates may only take effect together with use_maxsum —
# confirm against the KeyBERT documentation.
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(
    tldr_cat,
    keyphrase_ngram_range=(1, 3),
    stop_words='english',
    top_n=10,
    nr_candidates=40,
    use_mmr=True,
    diversity=0.7,
)
print(keywords)

[('neural architecture transformer', 0.5527), ('length disrupting', 0.1076), ('benchmarks covering summarization', 0.2611), ('language showed newly', 0.2474), ('explicitly allows spatial', 0.2433), ('replaces dot product', 0.2065), ('answering', 0.2506), ('finetune variety downstream', 0.0994), ('sampling algorithm hgsampling', -0.0439), ('nlp tasks layerdrop', 0.323)]


# Development Scratch

In [21]:
# Scratch: run a raw paper search and keep the decoded JSON in paper_list.
num_item = 30
offset = 0
query = 'hand'
# Percent-encode the query (words still joined by '+') so special characters
# cannot break the URL.
query = up.quote_plus(' '.join(query.split()))
# NOTE(review): `fos[0]=computer-science` looks like a website-search
# parameter; confirm the Graph API honours it (SSSQuery filters on the
# returned fieldsOfStudy instead).
url = f'https://api.semanticscholar.org/graph/v1/paper/search?fos[0]=computer-science&query={query}&offset={offset}&limit={num_item}'
# The context manager closes the HTTP response once the body has been read.
with request.urlopen(url) as response:
    paper_list = json.loads(response.read().decode('utf-8'))
# print(paper_list)

In [42]:
# Scratch: fetch the detail record of every search hit and print its citation
# count together with its fields of study.
for paper in paper_list['data']:
    paper_id = paper['paperId']
    paper_url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=citations.authors,tldr,citationCount,fieldsOfStudy'
    # The context manager closes each HTTP response instead of leaving it to
    # the garbage collector.
    with request.urlopen(paper_url) as response:
        paper_info = json.loads(response.read().decode('utf-8'))
    print(paper_info['citationCount'], paper_info['fieldsOfStudy'])