In [6]:
import os
import re
import json
import pickle
import requests

from bs4 import BeautifulSoup
from urllib.parse import urljoin


In [7]:
# Root index of the proceedings site; every conference link below is resolved
# relative to this URL.
nips_papers = 'https://papers.nips.cc/'
# Download the landing page once and parse it so the per-year links can be
# extracted from `soup` later in the notebook.
html_text = requests.get(url=nips_papers).text
soup = BeautifulSoup(markup=html_text, features='html.parser')


In [8]:
def get_num_citations(title, authors, year):
    """Look up a paper's citation count via a Google Scholar advanced search.

    Args:
        title: Paper title; searched as an exact phrase restricted to titles.
        authors: Iterable of dicts with 'given_name' and 'family_name' keys.
            Missing or empty name parts are skipped.
        year: Publication year. Currently unused by the query; kept so that
            existing callers keep working.

    Returns:
        The integer from the first "Cited by N" link in the response, or None
        when no such link is present.

    NOTE(review): the original author flagged this function as having issues.
    Presumably Scholar can answer automated requests with a page that has no
    "Cited by" link, in which case this returns None - confirm before relying
    on the counts.
    """
    # Local import: the file only imports `urljoin` from urllib.parse.
    from urllib.parse import quote_plus

    name_parts = []
    for aa in authors or []:
        for key in ('given_name', 'family_name'):
            part = aa.get(key)
            if part:
                name_parts.append(str(part))

    # URL-encode both values so characters such as '&', '#' or '+' inside a
    # title or name cannot break the query string. quote_plus turns spaces
    # into '+', which matches the separator the original code used.
    title_query = quote_plus(str(title))
    authors_query = quote_plus(' '.join(name_parts))
    req_string = 'https://scholar.google.co.uk/scholar?as_q=&as_epq={title}&as_occt=title&as_sauthors={authors}&as_publication='.format(
        title=title_query, authors=authors_query)
    html_text = requests.get(req_string).text
    # Capture digits only so unexpected anchor text yields None instead of a
    # ValueError from int().
    re_result = re.search(r'>Cited by (\d+)</a>', html_text)
    num_citations = None if re_result is None else int(re_result.group(1))
    return num_citations

def get_reviews(html_handle):
    """Split a reviews page into per-reviewer text and confidence score.

    Every <h3> element is treated as the start of one review; the text of all
    following siblings up to the next <h3> is concatenated (each piece
    prefixed with a single space) to form that review's text.

    Args:
        html_handle: HTML markup of the reviews page.

    Returns:
        Dict mapping the heading's first child (as a plain str, '' for an
        empty heading) to {'text': str, 'confidence': int or None}.
        'confidence' is the integer that follows "Confidence in this Review"
        and precedes a '-', or None when no such number is found.
    """
    reviewer_soup = BeautifulSoup(html_handle, 'html.parser')
    reviewer_dict = {}
    for reviewer in reviewer_soup.find_all('h3'):
        pieces = []
        for sib in reviewer.find_next_siblings():
            if sib.name == "h3":
                # Next reviewer's heading: the current review ends here.
                break
            pieces.append(' ' + sib.text)
        review_text = ''.join(pieces)
        # Match digits only so non-numeric text between the label and the '-'
        # gives None instead of crashing int().
        re_result = re.search(r'Confidence in this Review\s+(\d+)\s*-', review_text)
        review_conf = None if re_result is None else int(re_result.group(1))
        # Convert to a plain str: BeautifulSoup string objects keep a
        # reference to the whole parse tree, which is undesirable for data
        # that is stored and pickled later. Also tolerate an empty heading.
        reviewer_name = str(reviewer.contents[0]) if reviewer.contents else ''
        reviewer_dict[reviewer_name] = {
            'text': review_text, 'confidence': review_conf}
    return reviewer_dict
    

In [9]:
# Resume from a previous run when a cache file exists; otherwise start with
# an empty mapping that the scraping loop fills in.
# NOTE: pickle.load executes arbitrary code from the file - only load a cache
# this notebook wrote itself.
if not os.path.isfile('neurips_conf_data.pkl'):
    conf_data = {}
else:
    with open('neurips_conf_data.pkl', 'rb') as handle:
        conf_data = pickle.load(handle)


In [None]:

# Loop through all conference years
# Keep only list items whose link mentions 'paper'; items without an anchor
# or href are skipped instead of raising.
all_conferences = [
    cc for cc in soup.find_all('li')
    if cc.a is not None and 'paper' in (cc.a.get('href') or '')]
all_conferences = all_conferences[::-1]
# Resume support: conf_data gets one entry per finished conference (see the
# end of the loop), so slicing by its length skips conferences already saved.
# NOTE(review): this assumes the order of the index page is stable between
# runs - confirm.
for cc in all_conferences[len(conf_data):]:
    conf_link = urljoin(nips_papers, cc.a.get('href'))
    conf_year = conf_link.split('/')[-1]
    html_text = requests.get(conf_link).text
    conf = BeautifulSoup(html_text, 'html.parser')

    # Loop through all current conference's papers
    print("\n\nProcessing: ", cc.a.contents[0])
    paper_list = []
    all_papers = [
        pp for pp in conf.find_all('li')
        if pp.a is not None and 'paper' in (pp.a.get('href') or '')]
    for pi, pp in enumerate(all_papers):
        # Get paper info
        # Plain str: parser string objects reference the whole parse tree and
        # this title ends up in the pickled results.
        paper_title = str(pp.a.contents[0])
        print(" - paper [{}/{}]: {}".format(pi + 1, len(all_papers), paper_title))
        paper_link = urljoin(conf_link, pp.a.get('href'))
        # All per-paper resources are derived from the abstract link by
        # string substitution on the URL.
        link_file = paper_link.replace('hash', 'file')
        # Extract paper metadata
        link_meta = link_file.replace('html', 'json')
        link_meta = link_meta.replace('Abstract', 'Metadata')
        html_text = requests.get(link_meta).text
        if html_text == 'Resource Not Found':
            # No metadata JSON: fall back to scraping the abstract page.
            html_ = requests.get(paper_link).text
            # Separate name so the conference-level `conf` soup is not shadowed.
            paper_page = BeautifulSoup(html_, 'html.parser')
            abstract_text = paper_page.find('h4', string='Abstract').next_sibling.next_sibling.contents[0]
            if abstract_text == 'Abstract Unavailable':
                # Previously the clean-up below ran on None and crashed.
                abstract = None
            else:
                abstract = str(abstract_text)
                abstract = abstract.replace('<p>', '')
                abstract = abstract.replace('</p>', '')
                abstract = abstract.replace('\n', ' ')
            # Authors come as one comma-separated line in the <i> element.
            if pp.i is not None and pp.i.contents:
                author_line = str(pp.i.contents[0])
            else:
                author_line = ''
            author_list = []
            for aa in author_line.split(', '):
                aa = aa.strip()
                if not aa:
                    continue
                # Last token is the family name, everything before it the
                # given name(s); a single-token name gets an empty given
                # name instead of raising IndexError.
                given_name, _sep, family_name = aa.rpartition(' ')
                author_list.append({
                    'given_name': given_name,
                    'family_name': family_name,
                    'institution': None})
            paper_meta = {
                'title': paper_title,
                'authors': author_list,
                'abstract': abstract,
                'full_text': None
            }
        else:
            paper_meta = json.loads(html_text)
            # full_text may be absent or null in the metadata.
            if isinstance(paper_meta.get('full_text'), str):
                paper_meta['full_text'] = paper_meta['full_text'].replace('\n', ' ')
        # Extract paper supplemental
        # NOTE(review): each existence check fetches the complete response
        # body (the whole zip/pdf when it exists) - consider a lighter check.
        link_supplement = link_file.replace('html', 'zip')
        link_supplement = link_supplement.replace('Abstract', 'Supplemental')
        html_text = requests.get(link_supplement).text
        has_zip = html_text != 'Resource Not Found'
        link_supplement = link_supplement.replace('zip', 'pdf')
        html_text = requests.get(link_supplement).text
        has_pdf = html_text != 'Resource Not Found'
        has_supplement = has_pdf or has_zip
        # Extract paper reviews
        link_review = link_file.replace('Abstract', 'Reviews')
        html_text = requests.get(link_review).text
        if html_text == 'Resource Not Found':
            reviews = None
        else:
            reviews = get_reviews(html_text)
        # Extract scholar citation data
        num_cit = get_num_citations(title=paper_meta['title'], authors=paper_meta['authors'], year=conf_year)
        # Update paper info
        paper_meta.update({
            'year': conf_year,
            'citations': num_cit,
            # .get: an author entry without an 'institution' key counts as None.
            'institutions': list(set([aa.get('institution') for aa in paper_meta['authors']])),
            'reviews': reviews,
            'has_supplement': has_supplement})
        paper_list.append(paper_meta)

    # Update conference info
    # Saved once per conference, so an interrupted run restarts at the
    # beginning of the conference that was in progress.
    conf_data[conf_year] = paper_list
    with open('neurips_conf_data.pkl', 'wb') as handle:
        pickle.dump(conf_data, handle, protocol=pickle.HIGHEST_PROTOCOL)




Processing:  Advances in Neural Information Processing Systems 33  pre-proceedings (NeurIPS 2020)
 - paper [1/1898]: A graph similarity for deep learning
 - paper [2/1898]: An Unsupervised Information-Theoretic Perceptual Quality Metric
 - paper [3/1898]: Self-Supervised MultiModal Versatile Networks
 - paper [4/1898]: Benchmarking Deep Inverse Models over time, and the Neural-Adjoint method
 - paper [5/1898]: Off-Policy Evaluation and Learning for External Validity under a Covariate Shift
 - paper [6/1898]: Neural Methods for Point-wise Dependency Estimation
 - paper [7/1898]: Fast and Flexible Temporal Point Processes with Triangular Maps
 - paper [8/1898]: Backpropagating Linearly Improves Transferability of Adversarial Examples
 - paper [9/1898]: PyGlove: Symbolic Programming for Automated Machine Learning
 - paper [10/1898]: Fourier Sparse Leverage Scores and Approximate Kernel Learning
 - paper [11/1898]: Improved Algorithms for Online Submodular Maximization via First-order Re