In [1]:
# Core analysis packages
import numpy as np
import os, sys
import pandas as pd
from scipy import stats
from scipy.special import comb
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats import anova
# from patsy import dmatrices
import bff
import pingouin as pg

# Plotting packages
import matplotlib.pyplot as plt
plt.rcdefaults()
import matplotlib.colors as colors
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm as cmx
import pylab

import seaborn as sns 
# Seaborn theming. The calls are order-dependent: sns.set() first applies the "ticks" style
# (with color_codes=True), the next call switches the style to "white", and the last call
# sets only the two listed parameters so bottom x-ticks and left y-ticks are drawn.
sns.set(style="ticks", color_codes=True)
sns.set_style("white")
sns.set_style({'xtick.bottom': True, 'ytick.left': True})
# Named colors collected in one list; NOTE(review): not used in the cells visible here —
# presumably indexed by later plotting cells, confirm before changing order.
colorref = ["gray", "royalblue", "crimson", "goldenrod", "mediumorchid", "seagreen"]

# iPython magic commands
%matplotlib notebook
%load_ext autoreload
%autoreload 2
%autosave 30

# Font sizes shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 12
BIG_SIZE = 14

# Push the sizes into matplotlib's rc settings, one rc group per call.
plt.rc('font', size=SMALL_SIZE)                               # default text size
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)   # axes title / x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)                         # x tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)                         # y tick labels
plt.rc('legend', fontsize=SMALL_SIZE)                         # legend text
plt.rc('figure', titlesize=BIG_SIZE)                          # figure-level title

# Take entries 6-9 of seaborn's "Paired" palette and swap the members of each
# consecutive pair, i.e. reorder them as [1, 0, 3, 2].
cust_palette = sns.color_palette("Paired")[6:10]
cust_palette = [cust_palette[1], cust_palette[0], cust_palette[3], cust_palette[2]]

  **kwargs


Autosaving every 30 seconds


### Scrape NeurIPS proceedings metadata

In [24]:
import numpy as np
from numpy import genfromtxt
import pandas as pd
from scipy import sparse
from scipy.cluster import hierarchy
from scipy.spatial import distance

import networkx as nx

from bs4 import BeautifulSoup
import urllib.request
import re, json, requests, itertools
from tqdm.notebook import tqdm

# Root of the NeurIPS proceedings site and the per-year index pages that can be scraped.
home = "https://proceedings.neurips.cc"
years = ["/paper/2015", "/paper/2016", "/paper/2017", "/paper/2018", "/paper/2019"]
# EDIT THE FOLLOWING LINE TO CHANGE WHICH YEAR'S DATA IS COLLECTED:
# (with the five entries above, index -5 selects "/paper/2015")
url = home+years[-5]

# Download the year's index page. The context manager closes the HTTP response
# once the body has been read instead of leaving the connection open.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page.read(), "html.parser")
# Keep every anchor that has an href, minus the first three and the last one.
# NOTE(review): those dropped anchors are presumably site navigation rather than
# papers — re-check the slice if the page layout changes.
papers = soup.find_all("a", href=True)[3:-1]

# Extra annotations
conf = "neurips"
year = url[-4:]  # the last four characters of the URL are the year

### Create dataframe using scraped data

In [25]:
def _parse_header_emails(fulltext):
    """Return the email addresses found before the word "Abstract" in a paper's full text.

    Grouped addresses of the form ``{author1, author2}@the.same.org.edu`` are expanded
    into one address per user name, all sharing the domain of the last one.

    Parameters
    ----------
    fulltext : str or None
        The paper's extracted full text, if the metadata provides one.

    Returns
    -------
    list of str
        Addresses in order of appearance; empty when ``fulltext`` is not a string
        or contains no "Abstract" marker.
    """
    if not isinstance(fulltext, str):
        return []
    marker = re.search(r"Abstract", fulltext)
    if marker is None:
        return []
    # Only the text preceding the abstract (title/author header) is searched.
    header = re.sub(r"\n", " ", fulltext[:marker.start(0)])
    header = re.sub(r"({|})", "", header)
    header = re.sub(r", ", ",", header)  # so "a, b@x.org" is captured as one comma-joined token
    addresses = []
    for token in re.findall(r"[\w][\w,\.-]+@[\w\.-]+", header):
        pieces = token.split(",")
        # The regex allows exactly one "@" per token and it sits in the last piece.
        commondomain = pieces[-1].split("@")[1]
        addresses.extend(user + "@" + commondomain for user in pieces[:-1])
        addresses.append(pieces[-1])
    return addresses


# Columns that must exist (and come first) in the final frame even if no paper supplies them.
base_columns = ["paper_id", "given_name", "family_name", "institution", "email"]

author_emails = []  # not used in this cell; kept in case other cells reference it
# author_institutions = []

# Collect one small frame per paper and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every iteration.
authorblocks = []

for paper in tqdm(papers):
    # Swap the 13-character suffix of the paper link's file name for "Metadata.json".
    papermetaurl = url + "/file/" + paper["href"].split("/")[-1][:-13] + "Metadata.json"
    # The metadata endpoint returns JSON, so parse it directly.
    # The timeout keeps one stalled request from hanging the whole scrape.
    rsp = requests.get(papermetaurl, timeout=30)
    d = rsp.json()
    # Retrieve basic author block: names and affiliations + paperID for grouping
    authorblock = pd.DataFrame(d["authors"])
    authorblock.insert(0, "paper_id", d["sourceid"])
    authorblock.insert(1, "title", d["title"])
    # Retrieve author emails (if possible). The full text is not always present in the
    # metadata, in which case the helper returns [] and the paper simply has no emails.
    emails = _parse_header_emails(d.get("full_text"))
    # Only trust the emails when there is exactly one per author; they are assigned
    # positionally, so a count mismatch cannot be matched to authors.
    if emails and len(emails) == len(authorblock) and "email" not in authorblock.columns:
        authorblock["email"] = emails
    authorblocks.append(authorblock)

if authorblocks:
    df = pd.concat(authorblocks)
    # Base columns first, then any extra columns (e.g. "title") in order of appearance.
    extra_columns = [c for c in df.columns if c not in base_columns]
    df = df.reindex(columns=base_columns + extra_columns)
else:
    df = pd.DataFrame(columns=base_columns)
# reset_index keeps each author's position within the paper as an "index" column.
df = df.reset_index()
df.insert(0, "year", year)
df.insert(0, "conference", conf)

  0%|          | 0/403 [00:00<?, ?it/s]

### Construct organization variable from email domains

In [26]:
# Domain labels that do not identify an organization: generic and country-code
# top-level domains plus second-level labels such as "ac" and "co".
badtails = [
    "com", "edu", "org", "net", "ai", "io", "gov", "co", 
    "ac", "uk", "sg", "cn", "fr", "de", "ch", "cz", "ca", "hk", "tw", "il", "au", "dk", "pl", "it", "jp", "be", "kr", "ru", "uy", "qa", "br", "vn", "sa", "se", 
]


def extract_org(email):
    """Return the core organization label of an email address, or NaN if there is none.

    The core label is the right-most domain label left after dropping the final label
    and any trailing labels listed in ``badtails``, so ``eecs.berkeley.edu`` and
    ``berkeley.edu`` map to the same organization.

    Examples
    --------
    ``"maddie@eecs.berkeley.ac.uk"`` -> ``"berkeley"``
    ``"maddie@google"``              -> NaN (no label is left after dropping the last one)

    Parameters
    ----------
    email : str or any
        The address to parse. Non-strings (e.g. NaN for a missing email) yield NaN.

    Returns
    -------
    str or float
        The lower-cased organization label, or ``np.nan`` when the input is not a
        string, does not contain exactly one "@", or has no usable label.
    """
    if not isinstance(email, str):
        return np.nan
    parts = email.split("@")
    if len(parts) != 2:
        return np.nan
    # Domains are case-insensitive, so compare in lower case; skip empty labels such as
    # the one produced by a trailing "." picked up from running text.
    subdomains = [label for label in parts[1].lower().split(".") if label]
    subdomains = subdomains[:-1]  # the final label is always dropped
    while subdomains and subdomains[-1] in badtails:  # Until the label is not a country code or etc.
        subdomains.pop()
    return subdomains[-1] if subdomains else np.nan
    
# Derive one organization label per row from the email column (NaN where none can be extracted).
org = df["email"].apply(extract_org)
# DataFrame.insert refuses to add a column name that already exists, so drop the column
# left over from a previous run to keep this cell re-runnable.
if "org" in df.columns:
    df = df.drop(columns="org")
df.insert(6, "org", org)

In [27]:
df.head(20)

Unnamed: 0,conference,year,index,paper_id,given_name,family_name,org,institution,email,title
0,neurips,2015,0,550,Zheng,Qu,hku,University of Hong Kong,zhengqu@maths.hku.hk,Quartz: Randomized Dual Coordinate Ascent with...
1,neurips,2015,1,550,Peter,Richtarik,ed,University of Edinburgh,peter.richtarik@ed.ac.uk,Quartz: Randomized Dual Coordinate Ascent with...
2,neurips,2015,2,550,Tong,Zhang,rutgers,Rutgers,tzhang@stat.rutgers.edu,Quartz: Randomized Dual Coordinate Ascent with...
3,neurips,2015,0,1559,Arya,Mazumdar,umn,University of Minnesota -- Twin Cities,arya@umn.edu,Associative Memory via a Sparse Recovery Model
4,neurips,2015,1,1559,Ankit Singh,Rawat,cmu,Carnegie Mellon University,asrawat@andrew.cmu.edu,Associative Memory via a Sparse Recovery Model
5,neurips,2015,0,890,Aviv,Tamar,berkeley,Technion,avivt@berkeley.edu,Policy Gradient for Coherent Risk Measures
6,neurips,2015,1,890,Yinlam,Chow,inria,Stanford,mohammad.ghavamzadeh@inria.fr,Policy Gradient for Coherent Risk Measures
7,neurips,2015,2,890,Mohammad,Ghavamzadeh,stanford,Adobe Research & INRIA,ychow@stanford.edu,Policy Gradient for Coherent Risk Measures
8,neurips,2015,3,890,Shie,Mannor,technion,Technion,shie@ee.technion.ac.il,Policy Gradient for Coherent Risk Measures
9,neurips,2015,0,134,Miguel,Carreira-Perpinan,,UC Merced,,"A fast, universal algorithm to learn parametri..."


In [28]:
# Write the scraped table for this conference/year to ./data/<conf>-<year>-titled.csv.
out_path = "./data/"+conf+"-"+year+"-titled.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)  # the data folder may not exist yet
# The encoding must be set on open(): to_csv ignores its own `encoding` argument when it is
# handed an already-open text handle. UTF-8 can represent every character, so there is no
# need for errors="ignore", which silently dropped characters the locale encoding could not
# represent (e.g. accented author names).
with open(out_path, "w", newline="", encoding="utf-8") as f:
    df.to_csv(path_or_buf=f, index=False)

**STOP**

---

Some unit tests / debug / deprecated code:

### Load the compiled dataset containing all scraped years

In [2]:
# Read the compiled workbook; the bare file name is resolved relative to the
# directory the notebook runs in.
compiled_path = os.path.expanduser("neurips-2015-19.xlsx")
df = pd.read_excel(compiled_path)
df.head()

Unnamed: 0,conference,year,index,paper_id,given_name,family_name,org,institution,email
0,neurips,2015,0,550,Zheng,Qu,hku,University of Hong Kong,zhengqu@maths.hku.hk
1,neurips,2015,1,550,Peter,Richtarik,ed,University of Edinburgh,peter.richtarik@ed.ac.uk
2,neurips,2015,2,550,Tong,Zhang,rutgers,Rutgers,tzhang@stat.rutgers.edu
3,neurips,2015,0,1559,Arya,Mazumdar,umn,University of Minnesota -- Twin Cities,arya@umn.edu
4,neurips,2015,1,1559,Ankit Singh,Rawat,cmu,Carnegie Mellon University,asrawat@andrew.cmu.edu
