scrape.py
#!/usr/bin/env python
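# Scrape paper metadata (title, authors, affiliations) from the ICML, NeurIPS and ICLR
# schedule pages and write everything into a single CSV file.
#
# Usage example (arguments mirror the argparse setup in main() below):
#   ./scrape.py 2018-2020 -o papers.csv --parallel 200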
import argparse
import asyncio
import functools
import re
from dataclasses import dataclass

import aiohttp
import bs4
import pandas as pd
from tqdm import tqdm

# Track the total number of requests, because all conferences are scraped in parallel.
REQUESTS_PBAR: tqdm = None

# Restrict the number of concurrent requests. If we opened all of the several thousand
# requests at once, some would inevitably time out.
OPEN_REQUESTS: asyncio.Semaphore = None
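
# Decorator: retry a coroutine up to n_tries attempts when the HTTP connection fails.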
def retry_on_server_disconnect(n_tries: int):
def decorator(f):
@functools.wraps(f)
async def wrapper(*args, **kwargs):
for i in range(n_tries):
try:
return await f(*args, **kwargs)
                except aiohttp.client_exceptions.ClientConnectionError as e:
                    # Re-raise on the final attempt; otherwise report the error and retry.
                    if i == n_tries - 1:
                        raise
                    print(f"Client error, trying again: {e}")
return wrapper
return decorator
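
# Download a page and parse it with BeautifulSoup, limited by the OPEN_REQUESTS
# semaphore and counted in the global progress bar.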
@retry_on_server_disconnect(3)
async def load_doc_from_url(session: aiohttp.ClientSession, url: str):
REQUESTS_PBAR.total += 1
async with OPEN_REQUESTS:
async with session.get(url) as response:
doc = bs4.BeautifulSoup(await response.text(), features="lxml")
REQUESTS_PBAR.update()
return doc
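
# Collect the event ids of all poster cards on a conference schedule page.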
async def load_paper_ids(session: aiohttp.ClientSession, url):
doc = await load_doc_from_url(session, url)
cards = doc.select(".maincard.Poster")
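    # The event id follows a fixed 9-character prefix in each card's id attribute.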
return [c.attrs["id"][9:] for c in cards]
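
# Load a paper's schedule entry and return its title plus (author name, speaker id) pairs.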
async def load_paper(session: aiohttp.ClientSession, url):
doc = await load_doc_from_url(session, url)
box = doc.select(".maincard")[0].parent
title = box.select(".maincardBody")[0].text.strip()
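    # Each author button's text ends with a two-character suffix that is stripped off;
    # its onclick handler embeds the speaker id, which is sliced out below.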
authors = [
(b.text.strip()[:-2].strip(), b.attrs["onclick"][13:-3])
        for b in box.find_all("button")
]
return title, authors
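
# Load an author's schedule entry and return their name and affiliation.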
async def load_author(session: aiohttp.ClientSession, url):
doc = await load_doc_from_url(session, url)
box = doc.select(".maincard")[0].parent
name = box.find("h3").text.strip()
affiliation = box.find("h4").text.strip()
return name, affiliation
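
# Conference metadata: display name, schedule host and the first year to scrape.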
@dataclass
class Conference:
name: str
host: str
first_year: int
def papers_url(self, year: int):
return f"https://{self.host}/Conferences/{year:d}/Schedule"
def paper_url(self, year: int, id: str):
return f"https://{self.host}/Conferences/{year:d}/Schedule?showEvent={id}"
def author_url(self, year: int, id: str):
return f"https://{self.host}/Conferences/{year:d}/Schedule?showSpeaker={id}"
async def scrape(self, year: int, session: aiohttp.ClientSession):
paper_ids = await load_paper_ids(session, self.papers_url(year))
paper_links = [self.paper_url(year, id) for id in paper_ids]
paper_tasks = [load_paper(session, link) for link in paper_links]
paper_data = await asyncio.gather(*paper_tasks)
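        # Deduplicate author ids across all papers before fetching the author pages.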
        author_ids = list({id for _, authors in paper_data for _, id in authors})
author_links = [self.author_url(year, id) for id in author_ids]
author_tasks = [load_author(session, link) for link in author_links]
author_data = await asyncio.gather(*author_tasks)
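        # Map author name -> affiliation and attach it to every paper's author list.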
affiliations = dict(author_data)
papers = [
(title, [(name, affiliations[name]) for name, _ in authors])
for title, authors in paper_data
]
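        # Flatten into one row per (paper, author) pair.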
unnormalized = [
(title, author, affiliation)
for title, authors in papers
for author, affiliation in authors
]
papers = pd.DataFrame(unnormalized, columns=["Title", "Author", "Affiliation"])
papers.insert(0, "Conference", self.name)
papers.insert(1, "Year", year)
return papers
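
# Supported conferences and the first year for which each one is scraped.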
CONFERENCES = [
Conference("ICML", "icml.cc", 2017),
Conference("NeurIPS", "neurips.cc", 2006),
Conference("ICLR", "iclr.cc", 2018),
]
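
# Parse the command-line arguments, scrape all requested conference years in parallel
# and write the result to a CSV file.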
async def main():
global REQUESTS_PBAR, OPEN_REQUESTS
parser = argparse.ArgumentParser(
description="Scrape paper data from ICML, NeurIPS and ICLR."
)
parser.add_argument(
"-o",
"--output",
default="papers.csv",
help="Where to store the data [Default: papers.csv]",
)
parser.add_argument(
"--parallel",
default=500,
type=int,
help="Number of parallel requests [Default: 500]",
)
parser.add_argument("years", help="Year or year range")
args = parser.parse_args()
output = args.output
parallel = args.parallel
years = args.years
OPEN_REQUESTS = asyncio.Semaphore(parallel)
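    # Accept either a single year ("2019") or an inclusive range ("2008-2010").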
if "-" in years:
match = re.match(r"^(\d+)-(\d+)$", years)
assert match, f"Invalid year range {years}; expected e.g. 2008-2010"
start, end = int(match[1]), int(match[2])
else:
start = end = int(years)
year_range = range(start, end + 1)
conferences = CONFERENCES
cf_names = ", ".join(c.name for c in conferences)
print(f"Scraping papers from {start}-{end} in {cf_names} into {output}")
with tqdm(total=0) as pbar:
REQUESTS_PBAR = pbar
async with aiohttp.ClientSession() as session:
paper_tasks = [
conf.scrape(year, session)
for conf in conferences
for year in year_range
if year >= conf.first_year
]
papers = await asyncio.gather(*paper_tasks)
df = pd.concat(papers)
    # Sort rows by [Year, Conference] with a stable sort so authors keep their original order
    df = df.sort_values(by=["Year", "Conference"], kind="mergesort")
    # Collapse repeated whitespace in author names
    df["Author"] = df["Author"].replace(r"\s+", " ", regex=True)
df.to_csv(output, index=False)
if __name__ == "__main__":
asyncio.run(main())