In [1]:
import sys, os
import numpy as np
from numpy import genfromtxt
import pandas as pd
from scipy import sparse
from scipy.cluster import hierarchy
from scipy.spatial import distance

import networkx as nx
from collections import Counter

from bs4 import BeautifulSoup
import urllib.request
import re, json, requests, itertools
from tqdm.notebook import tqdm

# Plotting packages
import matplotlib.pyplot as plt
plt.rcdefaults()
import matplotlib.colors as colors
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.cm as cmx
import pylab

import seaborn as sns 
sns.set(style="ticks", color_codes=True)
sns.set_style("white")
sns.set_style({'xtick.bottom': True, 'ytick.left': True})
colorref = ["gray", "royalblue", "crimson", "goldenrod", "mediumorchid", "seagreen"]

# import crossref
from crossref.restful import Works, Prefixes

# iPython magic commands
%matplotlib notebook
%load_ext autoreload
%autoreload 2
%autosave 30

# Font sizes (in points) shared by every figure in the notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 12
BIG_SIZE = 14

# Each entry is (rc group, settings) and is forwarded to plt.rc unchanged.
for rc_group, rc_kwargs in (
    ('font', {'size': SMALL_SIZE}),         # default text size
    ('axes', {'titlesize': SMALL_SIZE}),    # axes title
    ('axes', {'labelsize': MEDIUM_SIZE}),   # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),   # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),   # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),   # legend text
    ('figure', {'titlesize': BIG_SIZE}),    # figure title
):
    plt.rc(rc_group, **rc_kwargs)

# Entries 6..9 of seaborn's "Paired" palette, with each adjacent pair swapped.
paired = sns.color_palette("Paired")
cust_palette = [paired[i] for i in (7, 6, 9, 8)]

Autosaving every 30 seconds


### Scrape ACM proceedings using CrossRef (ACM assigns DOIs!)

In [26]:
# Proceedings landing pages on the ACM Digital Library, keyed by
# "<conference><year>". Kept for completeness only; nothing below reads it.
acm = dict(
    facct2019="https://dl.acm.org/doi/proceedings/10.1145/3287560",
    facct2020="https://dl.acm.org/doi/proceedings/10.1145/3351095",
    aies2018="https://dl.acm.org/doi/proceedings/10.1145/3278721",
    aies2019="https://dl.acm.org/doi/proceedings/10.1145/3306618",
    aies2020="https://dl.acm.org/doi/proceedings/10.1145/3375627",
)

# ISBN of each proceedings volume, same keys as `acm`. This is the value the
# CrossRef query is filtered on.
acm_isbn = dict(
    facct2019="9781450361255",
    facct2020="9781450369367",
    aies2018="9781450360128",
    aies2019="9781450363242",
    aies2020="9781450371100",
)

# CHANGE ME TO CHANGE CONFERENCE AND YEAR
# (conf + str(year) must be one of the keys above)
conf = "facct"
year = 2019

In [27]:
# Query CrossRef for every record registered under the proceedings' ISBN and
# flatten the author lists into one row per (paper, author).
works = Works()
base_columns = ["paper_id", "given", "family", "affiliation", "suffix"]


def _affiliation_name(affiliation):
    """Return the first affiliation name, or "" when none is listed.

    Accepts either a list of {"name": ...} dicts or a single such dict, the
    two shapes the original cell handled. An empty list or a missing value
    yields "" so one author without an affiliation no longer causes the whole
    paper to be dropped.
    """
    if isinstance(affiliation, list):
        affiliation = affiliation[0] if affiliation else None
    if isinstance(affiliation, dict):
        return affiliation.get("name", "")
    return ""


author_frames = []  # one frame per paper; concatenated once after the loop
failct = 0
for paper in works.filter(isbn=acm_isbn[conf+str(year)]):
    authors = paper.get("author")
    if not authors:
        # Records without authors (e.g. the proceedings entry itself) are skipped.
        failct += 1
        continue
    try:
        rows = [
            dict(author, affiliation=_affiliation_name(author.get("affiliation")))
            for author in authors
        ]
        authorblock = pd.DataFrame(rows).drop(columns="sequence", errors="ignore")
        # paper_id is the DOI suffix with the dots removed, e.g.
        # "10.1145/3287560.3287561" -> 32875603287561
        authorblock.insert(0, "paper_id", int(paper["DOI"].split("/")[-1].replace(".","")))
        title = paper["title"]
        authorblock.insert(1, "title", title[0] if isinstance(title, list) else title)
    except (KeyError, IndexError, TypeError, ValueError):
        # Malformed record (missing DOI/title, non-numeric DOI suffix, ...).
        failct += 1
        continue
    author_frames.append(authorblock)

if author_frames:
    # DataFrame.append was removed in pandas 2.0; a single concat is also linear
    # instead of quadratic. Base columns come first, then any additional fields
    # in order of first appearance (the column order append used to produce).
    df = pd.concat(author_frames, sort=False)
    extra_columns = [c for c in df.columns if c not in base_columns]
    df = df.reindex(columns=base_columns + extra_columns)
else:
    df = pd.DataFrame(columns=base_columns)

# reset_index() deliberately keeps the per-paper author position as an "index"
# column; later cells insert columns by position and rely on it being there.
df = df.reset_index()
# NOTE(review): "suffix" is CrossRef's name-suffix field; it is renamed to
# "email" apparently as an empty placeholder matching the NeurIPS schema - confirm.
df = df.rename(columns={"given": "given_name", "family": "family_name", "affiliation": "institution", "suffix": "email"})
df.insert(0, "year", year)
df.insert(0, "conference", conf)
print("Failed to process %d entries; at least 1 should fail" % failct)

# Normalise institution strings the same way as the NeurIPS lookup table:
# lower-case, then delete every non-letter character together with one
# following whitespace character.
df["institution"] = df["institution"].str.lower()
df["institution"] = df["institution"].str.replace(r'[^a-zA-Z ]\s?', r'', regex=True)
df.head()

Failed to process 4 entries; at least 1 should fail


Unnamed: 0,conference,year,index,paper_id,given_name,family_name,institution,email,title
0,facct,2019,0,32875603287561,Ran,Canetti,boston university and tel aviv university,,From Soft Classifiers to Hard Decisions
1,facct,2019,1,32875603287561,Aloni,Cohen,mit,,From Soft Classifiers to Hard Decisions
2,facct,2019,2,32875603287561,Nishanth,Dikkala,mit,,From Soft Classifiers to Hard Decisions
3,facct,2019,3,32875603287561,Govind,Ramnarayan,mit,,From Soft Classifiers to Hard Decisions
4,facct,2019,4,32875603287561,Sarah,Scheffler,boston university,,From Soft Classifiers to Hard Decisions


### Post-process organization from affiliation

Attempt 1 (co-registration): infer each author's likely organization (email domain) by matching their affiliation against institutions in the NeurIPS dataset

In [10]:
# Load file (from same directory as the notebook)
neurips = pd.read_excel(os.path.expanduser("neurips-2015-19.xlsx"))

# Normalise institution names: lower-case, then delete every non-letter
# character together with one following whitespace character.
neurips["institution"] = (
    neurips["institution"]
    .str.lower()
    .str.replace(r'[^a-zA-Z ]\s?', r'', regex=True)
)

# Lookup table: how often each (org, institution) pair occurs.
orglut = neurips.groupby(["org", "institution"]).size().reset_index(name='counts')
# Keep only institution strings longer than one character.
orglut = orglut.loc[orglut["institution"].map(len) > 1, :]
orglut.head()

Unnamed: 0,org,institution,counts
0,126,tencent ai lab,2
1,126,university of international business and econo...,1
2,126,xidian university,1
3,163,national university of defense technology,4
4,163,peking university,1


In [11]:
def match_neurips_org(person, lut=None):
    """Infer an organization for `person` from the NeurIPS lookup table.

    A lookup row is a candidate when its `institution` string occurs as a
    substring of the person's institution (after removing a trailing "usa").
    Among the candidates, the row with the highest `counts` wins; ties go to
    the first such row in `lut`.

    Parameters
    ----------
    person : row-like object with an `institution` attribute (e.g. a row
        passed in by `DataFrame.apply(..., axis="columns")`).
    lut : DataFrame with "org", "institution" and "counts" columns. Defaults
        to the module-level `orglut`.

    Returns
    -------
    The "org" value of the best candidate, or "" when nothing matches or the
    institution is missing / not a string.
    """
    if lut is None:
        lut = orglut
    inst = person.institution
    if not isinstance(inst, str):
        # Missing affiliations arrive as NaN; there is nothing to match on.
        return ""
    if inst.endswith("usa"):
        inst = inst[:-3]
    is_candidate = [str(known) in inst for known in lut["institution"]]
    candidates = lut.loc[is_candidate, :]
    if len(candidates) == 0:
        return ""
    # Positional argmax picks the first row among equal counts, so the result
    # does not depend on how an unstable sort happens to order ties.
    best = candidates["counts"].to_numpy().argmax()
    return candidates.iloc[best]["org"]
    
# Spot-check the matcher on a single author row.
sample_author = df.iloc[3]
print(sample_author)
print("\n\nInferred: ", match_neurips_org(sample_author))

conference                                     aies
year                                           2020
index                                             3
paper_id                             33756273375807
given_name                                  Stephen
family_name                                  Lorenz
institution         clarkson universitypotsdamnyusa
email                                           NaN
title          When Trusted Black Boxes Don't Agree
Name: 3, dtype: object


Inferred:  


In [28]:
# Infer an organization for every author row, then place it as the seventh
# column (position 6). insert() raises if "org" already exists, so this cell
# can only be run once per freshly built df.
inferred_org = df.apply(match_neurips_org, axis="columns")
df.insert(6, "org", inferred_org)
df.head(20)

Unnamed: 0,conference,year,index,paper_id,given_name,family_name,org,institution,email,title
0,facct,2019,0,32875603287561,Ran,Canetti,bu,boston university and tel aviv university,,From Soft Classifiers to Hard Decisions
1,facct,2019,1,32875603287561,Aloni,Cohen,mit,mit,,From Soft Classifiers to Hard Decisions
2,facct,2019,2,32875603287561,Nishanth,Dikkala,mit,mit,,From Soft Classifiers to Hard Decisions
3,facct,2019,3,32875603287561,Govind,Ramnarayan,mit,mit,,From Soft Classifiers to Hard Decisions
4,facct,2019,4,32875603287561,Sarah,Scheffler,bu,boston university,,From Soft Classifiers to Hard Decisions
5,facct,2019,5,32875603287561,Adam,Smith,bu,boston university,,From Soft Classifiers to Hard Decisions
6,facct,2019,0,32875603287568,Jake,Goldenfein,cornell,cornell technew yorknew york and cornell techc...,,The Profiling Potential of Computer Vision and...
7,facct,2019,0,32875603287599,Hussein,Mouzannar,usc,american university of beirut,,From Fair Decision Making To Social Equality
8,facct,2019,1,32875603287599,Mesrob I.,Ohannessian,ttic,toyota technological institute at chicago,,From Fair Decision Making To Social Equality
9,facct,2019,2,32875603287599,Nathan,Srebro,ttic,toyota technological institute at chicago,,From Fair Decision Making To Social Equality


In [29]:
# Write the author table to ./data/<conf>-<year>-titled.csv.
out_path = "./data/"+conf+"-"+str(year)+"-titled.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
# The encoding has to be set on open(): to_csv's own `encoding` argument has no
# effect on an already-open text handle. Without it the platform default
# encoding is used and errors="ignore" silently drops every character that
# encoding cannot represent (e.g. accented author names on non-UTF-8 locales).
with open(out_path, "w", newline="", encoding="utf-8", errors="ignore") as f:
    df.to_csv(path_or_buf=f, index=False)

---

---

---

### Create dataframe using scraped data

In [44]:
base_columns = ["paper_id", "given_name", "family_name", "institution", "email"]

author_emails = []  # unused in this cell; kept in case other cells read it


def _emails_from_header(fulltext):
    """Return the email addresses found before the first "Abstract" in `fulltext`.

    Grouped addresses of the form {author1, author2}@the.same.org are unpacked
    into one address per author. Returns [] when "Abstract" does not occur.
    """
    abstract = re.search(r"Abstract", fulltext)
    if abstract is None:
        return []
    raw = re.sub(r"\n", " ", fulltext[:abstract.start(0)])
    raw = re.sub(r"({|})", "", raw)
    raw = re.sub(r", ", ",", raw)
    emails = []
    for token in re.findall(r"[\w][\w,\.-]+@[\w\.-]+", raw):
        # The pattern allows exactly one "@" per token, so the last
        # comma-separated piece is a complete address carrying the shared domain.
        *users, last = token.split(",")
        commondomain = last.split("@")[1]
        emails.extend(user + "@" + commondomain for user in users)
        emails.append(last)
    return emails


author_frames = []  # one frame per paper; concatenated once after the loop
for paper in tqdm(papers):
    papermetaurl = url + "/file/" + paper["href"].split("/")[-1][:-13] + "Metadata.json"
    # The metadata endpoint returns JSON, so parse it directly. The timeout keeps
    # one stalled request from hanging the whole scrape.
    rsp = requests.get(papermetaurl, timeout=30)
    d = rsp.json()
    # Retrieve basic author block: names and affiliations + paperID for grouping
    authorblock = pd.DataFrame(d["authors"])
    authorblock.insert(0, "paper_id", d["sourceid"])
    # Retrieve author emails (if possible); the full text is not always present
    # in the metadata.
    fulltext = d.get("full_text")
    emails = _emails_from_header(fulltext) if isinstance(fulltext, str) else []
    # Only trust the emails when there is exactly one per author.
    if emails and len(emails) == len(authorblock) and "email" not in authorblock.columns:
        authorblock["email"] = emails
    author_frames.append(authorblock)

if author_frames:
    # DataFrame.append was removed in pandas 2.0; a single concat is also linear
    # instead of quadratic. Base columns first, then any additional fields.
    df = pd.concat(author_frames, sort=False)
    extra_columns = [c for c in df.columns if c not in base_columns]
    df = df.reindex(columns=base_columns + extra_columns)
else:
    df = pd.DataFrame(columns=base_columns)

# reset_index() keeps the per-paper author position as an "index" column; the
# later positional insert of "org" relies on it being there.
df = df.reset_index()
df.insert(0, "year", year)
df.insert(0, "conference", conf)

  0%|          | 0/1428 [00:00<?, ?it/s]

### Construct organization variable from email domains

In [45]:
# Domain labels that never identify an organization on their own; extract_org
# strips them from the right-hand end of an email domain. The two groups below
# follow the two rows of the original list, and order is preserved.
_badtails_first_row = ["com", "edu", "org", "net", "ai", "io", "gov", "co"]
_badtails_second_row = [
    "ac", "uk", "sg", "cn", "fr", "de", "ch", "cz", "ca", "hk", "tw", "il",
    "au", "dk", "pl", "it", "jp", "be", "kr", "ru", "uy", "qa", "br", "vn",
    "sa", "se",
]
badtails = _badtails_first_row + _badtails_second_row

# test = "maddie@eecs.berkeley.ac.uk" # This should work: "berkeley"
# test = "maddie@google" # This shouldn't

def extract_org(email):
    try:
        (user, domain) = email.split("@")
        # Get CORE domain == org, s.t. eecs.berkeley.edu and berkeley.edu map to the same organization
        subdomains = domain.split(".")
        subdomains.pop()
        while subdomains[-1] in badtails: # Until the subdomain is not a country code or etc.
            subdomains.pop()
        return subdomains[-1]
    except:
        return np.nan
    
# Derive the organization from each author's email and place it as the seventh
# column (position 6). insert() raises if "org" already exists, so this cell
# can only be run once per freshly built df.
org = df["email"].map(extract_org)
df.insert(6, "org", org)

In [47]:
# Write the author table to ./data/<conf>-<year>.csv.
# str(year) is required: concatenating an int year onto a str raises TypeError.
out_path = "./data/"+conf+"-"+str(year)+".csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
# The encoding has to be set on open(): to_csv's own `encoding` argument has no
# effect on an already-open text handle. Without it the platform default
# encoding is used and errors="ignore" silently drops every character that
# encoding cannot represent.
with open(out_path, "w", newline="", encoding="utf-8", errors="ignore") as f:
    df.to_csv(path_or_buf=f, index=False)