In [1]:
import pandas as pd
from tqdm import tqdm
import arxiv
import numpy as np
from datetime import datetime

In [2]:
df = pd.read_csv("nips2019.csv")

In [3]:
df[:10] # not every accepted papers in NIPS 2019 available on arxiv today

Unnamed: 0,title,authors,arxiv_id
0,Blind Super-Resolution Kernel Estimation using...,Sefi Bell-Kligler;Assaf Shocher;Michal Irani,
1,Guided Similarity Separation for Image Retrieval,Chundi Liu;Guangwei Yu;Maksims Volkovs;Cheng C...,
2,"Average Individual Fairness: Algorithms, Gener...",Saeed Sharifi-Malvajerdi;Michael Kearns;Aaron ...,http://arxiv.org/abs/1905.10607v1
3,Greedy InfoMax for Biologically Plausible Self...,Sindy Löwe;Peter O'Connor;Bastiaan Veeling,
4,Dynamics of stochastic gradient descent for tw...,Sebastian Goldt;Madhu Advani;Andrew Saxe;Flore...,
5,Parameter elimination in particle Gibbs sampling,Anna Wigren;Riccardo Sven Risuleo;Lawrence Mur...,
6,Nonparametric Density Estimation & Convergence...,Ananya Uppal;Shashank Singh;Barnabas Poczos,
7,On Robustness of Principal Component Regression,Anish Agarwal;Devavrat Shah;Dennis Shen;Dogyoo...,
8,Scalable Bayesian inference of dendritic volta...,Ruoxi Sun; Ian Kinsella;Scott Linderman;Liam ...,
9,Optimizing Generalized Rate Metrics through Th...,Harikrishna Narasimhan;Andrew Cotter;Maya Gupta,


In [4]:
print("Only {:.2f} % of NIPS 2019 papers available on arxiv by {}".format(len(df.dropna())/len(df)*100, datetime.now()))

Only 16.29 % of NIPS 2019 papers available on arxiv by 2019-09-13 09:47:12.238688


In [5]:
def retrieve_arxiv(title, authors, k=5):
    query = "title:{} au:{}".format(title, authors[0].lower().replace(" ", "_"))
    results = arxiv.query(query, max_results=k)
    found = False
    record = None
    for result in results:
        if title.strip().lower() == result["title"].strip().lower():
            if set(authors) == set(result["authors"]):
                found = True
                record = result
                break
    return record, found, results

In [6]:
retrieve_arxiv(title=df.loc[0]["title"], authors=df.loc[0]["authors"].split(";"))

(None,
 False,
 [{'id': 'http://arxiv.org/abs/1904.00523v1',
   'guidislink': True,
   'updated': '2019-04-01T01:14:23Z',
   'updated_parsed': time.struct_time(tm_year=2019, tm_mon=4, tm_mday=1, tm_hour=1, tm_min=14, tm_sec=23, tm_wday=0, tm_yday=91, tm_isdst=0),
   'published': '2019-04-01T01:14:23Z',
   'published_parsed': time.struct_time(tm_year=2019, tm_mon=4, tm_mday=1, tm_hour=1, tm_min=14, tm_sec=23, tm_wday=0, tm_yday=91, tm_isdst=0),
   'title': 'Toward Real-World Single Image Super-Resolution: A New Benchmark and A\n  New Model',
   'title_detail': {'type': 'text/plain',
    'language': None,
    'base': 'http://export.arxiv.org/api/query?search_query=title%3ABlind+Super-Resolution+Kernel+Estimation+using+an+Internal-GAN+au%3Asefi_bell-kligler&id_list=&start=0&max_results=5&sortBy=relevance&sortOrder=descending',
    'value': 'Toward Real-World Single Image Super-Resolution: A New Benchmark and A\n  New Model'},
   'summary': 'Most of the existing learning-based single image

In [7]:
for i in tqdm(range(len(df))):
    if not isinstance(df.loc[i]["arxiv_id"], str):
        title = df.loc[i]["title"]
        authors = df.loc[i]["authors"].split(";")
        record, results, found = retrieve_arxiv(title, authors)
        if record is not None and found:
            df.loc[i]["arxiv_id"] = record["id"]

100%|██████████| 1430/1430 [19:52<00:00,  1.22it/s]


In [8]:
df.to_csv("nips2019.csv", index=False) # a little bit increased
print("Only {:.2f} % of NIPS 2019 papers available on arxiv by {}".format(len(df.dropna())/len(df)*100, datetime.now()))

Only 16.50 % of NIPS 2019 papers available on arxiv by 2019-09-13 10:07:19.064846
