In [1]:
import multiprocessing
import os

import bs4
import pandas as pd
import requests
from tqdm import tqdm_notebook as tqdm

In [2]:
class Conference:
    """URL set for one edition of a conference schedule site.

    Attributes are plain strings:
      * ``papers_url`` - the schedule page of the given year.
      * ``paper_url``  - template for a single event page; contains one ``{}``
        placeholder to be filled via ``str.format``.
      * ``author_url`` - template for a single speaker page; contains one
        ``{}`` placeholder to be filled via ``str.format``.
    """

    def __init__(self, host, year):
        # All three addresses share the same schedule endpoint and differ only
        # in the query string appended to it.
        schedule = f"https://{host}/Conferences/{year}/Schedule"

        self.papers_url = schedule
        self.paper_url = schedule + "?showEvent={}"
        self.author_url = schedule + "?showSpeaker={}"
        
def load_paper_ids(url, timeout=30):
    """Collect the ids of all poster cards listed on a schedule page.

    Args:
        url: Address of the schedule page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely.

    Returns:
        List of strings, one per element matching ``.maincard.Poster``: the
        element's ``id`` attribute with its first nine characters removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    doc = bs4.BeautifulSoup(response.text, "html.parser")

    cards = doc.select(".maincard.Poster")

    # NOTE(review): the 9-character prefix is presumably "maincard_" - confirm
    # against the live markup.
    return [card.attrs["id"][9:] for card in cards]

def load_paper(link, timeout=30):
    """Download one paper page and extract its title and author list.

    Args:
        link: Address of the paper (event) page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely.

    Returns:
        Tuple ``(title, authors)`` where ``authors`` is a list of
        ``(name, author_id)`` string pairs, one per ``<button>`` found next to
        the first ``.maincard`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    doc = bs4.BeautifulSoup(response.text, "html.parser")
    box = doc.select(".maincard")[0].parent
    title = box.select(".maincardBody")[0].text.strip()

    # Button label: the last two characters are dropped (presumably a trailing
    # decoration such as " »" - confirm against the live markup).
    # onclick: the first 13 and last 3 characters are dropped (presumably the
    # wrapper "showSpeaker('" ... "');" around the id - confirm).
    authors = [
        (button.text.strip()[:-2].strip(), button.attrs["onclick"][13:-3])
        for button in box.find_all("button")
    ]

    return title, authors

def load_author(link, timeout=30):
    """Download one author page and extract the name and affiliation.

    Args:
        link: Address of the author (speaker) page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely.

    Returns:
        Tuple ``(name, affiliation)``: the stripped text of the first ``<h3>``
        and the first ``<h4>`` found next to the first ``.maincard`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    doc = bs4.BeautifulSoup(response.text, "html.parser")
    box = doc.select(".maincard")[0].parent
    name = box.find("h3").text.strip()
    affiliation = box.find("h4").text.strip()

    return name, affiliation

def scrape_conference(pool, conference):
    """Scrape all poster papers of a conference with their authors' affiliations.

    Args:
        pool: Worker pool exposing an order-preserving ``imap`` (for example
            ``multiprocessing.Pool``); used to download pages in parallel.
        conference: Object providing ``papers_url`` and the ``paper_url`` /
            ``author_url`` templates, each with a single ``{}`` placeholder.

    Returns:
        pandas.DataFrame with one row per (paper, author) pair and the columns
        ``Title``, ``Author`` and ``Affiliation``. ``Author`` is the name as
        shown on the paper page.
    """
    paper_ids = load_paper_ids(conference.papers_url)
    paper_links = [conference.paper_url.format(paper_id) for paper_id in paper_ids]
    papers = list(tqdm(pool.imap(load_paper, paper_links), total=len(paper_links)))

    # Deduplicate author ids while keeping a stable order: ``imap`` yields
    # results in input order, so each profile can be zipped back onto its id.
    author_ids = list(dict.fromkeys(
        author_id
        for _, paper_authors in papers
        for _, author_id in paper_authors
    ))
    author_links = [conference.author_url.format(author_id) for author_id in author_ids]
    profiles = list(tqdm(pool.imap(load_author, author_links), total=len(author_links)))

    # Key affiliations by author id rather than by display name. The name on
    # the author page need not match the paper-page label character for
    # character (a mismatch used to raise KeyError), and two different people
    # sharing a name would otherwise overwrite each other's affiliation.
    affiliations = {
        author_id: affiliation
        for author_id, (_, affiliation) in zip(author_ids, profiles)
    }

    unnormalized = [
        (title, name, affiliations[author_id])
        for title, paper_authors in papers
        for name, author_id in paper_authors
    ]

    return pd.DataFrame(unnormalized, columns=["Title", "Author", "Affiliation"])

In [3]:
# Scrape the ICML schedules, producing one DataFrame per year.
icml_papers = []
for year in [2017, 2018]:
    conf = Conference("icml.cc", str(year))

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the pool size is always a valid integer.
    with multiprocessing.Pool(3 * (os.cpu_count() or 1)) as pool:
        papers = scrape_conference(pool, conf)
        # Tag every row with its origin so the per-year frames can be merged.
        papers.insert(0, "Conference", "ICML")
        papers.insert(1, "Year", year)
        
    icml_papers.append(papers)

HBox(children=(IntProgress(value=0, max=433), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1584), HTML(value='')))




HBox(children=(IntProgress(value=0, max=621), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2314), HTML(value='')))




In [None]:
# Scrape the NIPS schedules, producing one DataFrame per year.
nips_papers = []
for year in [2016, 2017, 2018]:
    conf = Conference("nips.cc", str(year))

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the pool size is always a valid integer.
    with multiprocessing.Pool(3 * (os.cpu_count() or 1)) as pool:
        papers = scrape_conference(pool, conf)
        # Tag every row with its origin so the per-year frames can be merged.
        papers.insert(0, "Conference", "NIPS")
        papers.insert(1, "Year", year)
        
    nips_papers.append(papers)

HBox(children=(IntProgress(value=0, max=568), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1956), HTML(value='')))




HBox(children=(IntProgress(value=0, max=431), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1519), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [None]:
# Scrape the ICLR schedules, producing one DataFrame per year.
iclr_papers = []
for year in [2018]:
    conf = Conference("iclr.cc", str(year))

    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 1 so the pool size is always a valid integer.
    with multiprocessing.Pool(3 * (os.cpu_count() or 1)) as pool:
        papers = scrape_conference(pool, conf)
        # Tag every row with its origin so the per-year frames can be merged.
        papers.insert(0, "Conference", "ICLR")
        papers.insert(1, "Year", year)
        
    iclr_papers.append(papers)

HBox(children=(IntProgress(value=0, max=336), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1412), HTML(value='')))

In [None]:
# Merge every per-year frame into one table. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each frame's own labels.
all_papers = pd.concat(icml_papers + nips_papers + iclr_papers, ignore_index=True)

# Fix multiple spaces in author names
# (raw string: "\s" is not a valid escape sequence in a normal string literal)
all_papers["Author"] = all_papers["Author"].replace(r"\s+", " ", regex=True)

all_papers.to_csv("papers.csv", index=False)