In [26]:
import asyncio
import csv
import json
import re

import aiohttp
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the arXiv archive index and keep the first field of every entry.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error here instead of letting it
# surface later as a confusing JSON decoding failure.
archives_response = requests.get("https://arxiv.org/archive", timeout=30)
archives_response.raise_for_status()
archives = [entry[0] for entry in archives_response.json()['archives']]
archives[:4]

['astro-ph', 'cond-mat', 'cs', 'econ']

In [3]:
# Download the cs.AI/1501 listing page (requested with show=1000) and parse it.
# The response keeps its own name so `articles` only ever refers to the parsed
# document; the timeout and status check make network failures fail loudly here.
listing_response = requests.get("https://arxiv.org/list/cs.AI/1501?show=1000", timeout=30)
listing_response.raise_for_status()
articles = BeautifulSoup(listing_response.text, 'lxml')

  articles = BeautifulSoup(articles.text, 'lxml')


In [4]:
# The first link inside each <dt> entry carries the paper id as the last
# path segment of its href; collect those segments.
id_links = articles.select('dt > span > a:nth-child(1)')
ids = [link['href'].rsplit('/', 1)[-1] for link in id_links]
ids[:4]

['1501.00601', '1501.00653', '1501.01178', '1501.01239']

In [5]:
def get_data(id):
    """Fetch the abstract and citation count for one arXiv paper (blocking).

    NOTE(review): a later cell defines ``async def get_data`` under the same
    name; once that cell has run, it replaces this function in the kernel.

    Args:
        id: arXiv identifier, interpolated into the arXiv and Scholar URLs.

    Returns:
        Tuple ``(abstract, citations)``. ``citations`` is 0 when the Scholar
        page yields no link whose text matches "Cited by <n>" (including when
        the selector matches nothing at all).

    Raises:
        IndexError: if the arXiv page has no ``#abs > blockquote`` element.
    """
    # Timeouts keep a stalled connection from blocking the notebook forever.
    arxiv_response = requests.get(f'https://arxiv.org/abs/{id}', timeout=30)
    article = BeautifulSoup(arxiv_response.text, 'lxml')

    # find block, remove 'Abstract:' (the 9-character label at the start)
    abstract_blocks = article.select('#abs > blockquote')
    abstract = abstract_blocks[0].text.strip()[9:]

    scholar_response = requests.get(
        f"https://scholar.google.com/scholar_lookup?arxiv_id={id}", timeout=30
    )
    scholar = BeautifulSoup(scholar_response.text, 'lxml')

    # find link, remove 'Cited by'. The third footer link is not always the
    # citation link, so parse it with a regex and fall back to 0 rather than
    # crashing on int('articles') or on an empty selection.
    links = scholar.select('#gs_res_ccl_mid > div > div.gs_ri > div.gs_fl.gs_flb > a:nth-child(3)')
    match = re.search(r'Cited by (\d+)', links[0].text) if links else None
    citations = int(match.group(1)) if match else 0

    return abstract, citations

In [33]:
async def get_abstract(id):
    """Download an arXiv abstract page, cache it on disk and return the abstract.

    The page at ``https://arxiv.org/abs/{id}`` is parsed with lxml; the text of
    the first ``#abs > blockquote`` element is stripped and its first nine
    characters (the 'Abstract:' label) are dropped. The parsed page and the
    abstract are written as JSON to ``data/a-{id}.json``.

    Args:
        id: arXiv identifier used in both the URL and the cache file name.

    Returns:
        The abstract text.

    Raises:
        IndexError: if the page contains no ``#abs > blockquote`` element.
    """
    url = f'https://arxiv.org/abs/{id}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
            article = BeautifulSoup(html, 'lxml')
            # locate the abstract block and cut off the leading label
            blocks = article.select('#abs > blockquote')
            first_block_text = blocks[0].text.strip()
            abstract = first_block_text[9:]

    payload = {'page': str(article), 'abstract': abstract}
    with open(f'data/a-{id}.json', 'w') as f:
        json.dump(payload, f)
    return abstract

async def get_citations(id):
    """Look up a paper on Google Scholar, cache the page and return its citation count.

    The Scholar lookup page for ``arxiv_id={id}`` is fetched with a
    browser-like User-Agent header, parsed with lxml, and the third footer
    link of the result is searched for "Cited by <n>". The parsed page and the
    count are written as JSON to ``data/s-{id}.json``.

    Args:
        id: arXiv identifier used in both the URL and the cache file name.

    Returns:
        The citation count as an int, or 0 when no "Cited by <n>" link is
        found. That includes pages where the selector matches nothing; the
        saved page can be inspected to tell why.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = f"https://scholar.google.com/scholar_lookup?arxiv_id={id}"
    async with aiohttp.ClientSession(trust_env=True, headers=headers) as session:
        async with session.get(url) as response:
            scholar = BeautifulSoup(await response.text(), 'lxml')

    # find link, remove 'Cited by'. An empty selection is treated like a link
    # without a count, so one odd page does not raise IndexError and abort
    # every other task gathered alongside it.
    links = scholar.select('#gs_res_ccl_mid > div > div.gs_ri > div.gs_fl.gs_flb > a:nth-child(3)')
    match = re.search(r'Cited by (\d+)', links[0].text) if links else None
    citations = int(match.group(1)) if match else 0

    with open(f'data/s-{id}.json', 'w') as f:
        json.dump({'page': str(scholar), 'citations': citations}, f)
    return citations

async def get_data(id):
    """Collect the abstract and the citation count for one paper.

    Both lookups are handed to ``asyncio.gather`` together, so they run
    concurrently.

    Args:
        id: arXiv identifier passed to ``get_abstract`` and ``get_citations``.

    Returns:
        Tuple ``(abstract, citations)``.
    """
    abstract_job = get_abstract(id)
    citations_job = get_citations(id)
    results = await asyncio.gather(abstract_job, citations_job)
    return tuple(results)

In [20]:
async def scrape(ids, max_concurrency=10):
    """Run ``get_data`` for every id and return the results in input order.

    Args:
        ids: iterable of arXiv identifiers.
        max_concurrency: upper bound on how many ``get_data`` calls are in
            flight at once. The default of 10 is an arbitrary, conservative
            choice so a long id list does not open every connection at the
            same moment; pass ``None`` to launch everything at once.

    Returns:
        List with one ``get_data`` result per id, in the same order as ``ids``.

    Raises:
        ValueError: if ``max_concurrency`` is given but smaller than 1.
    """
    if max_concurrency is None:
        # gather() schedules the coroutines itself; no explicit create_task needed.
        return await asyncio.gather(*(get_data(id) for id in ids))
    if max_concurrency < 1:
        # A zero-slot semaphore would block every task forever.
        raise ValueError("max_concurrency must be at least 1 or None")

    slots = asyncio.Semaphore(max_concurrency)

    async def _bounded(id):
        # Wait for a free slot before starting the actual lookups.
        async with slots:
            return await get_data(id)

    return await asyncio.gather(*(_bounded(id) for id in ids))

In [None]:
# Scrape every id collected above; `data` ends up with one result per entry of
# `ids`, which the CSV-writing cell pairs back up via zip(ids, data).
# The bare top-level `await` depends on the notebook running cells inside an event loop.
data = await scrape(ids)

In [17]:
# Write one row per paper: id, citation count, abstract.
# csv.writer quotes fields as needed, so commas, quotes and line breaks inside
# an abstract no longer spill into extra columns or rows. newline="" is how
# the csv module expects its output file to be opened, and lineterminator="\n"
# keeps the same row ending the cell produced before.
with open("data/batch/1501.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    for id, (abstract, citations) in zip(ids, data):
        writer.writerow([id, citations, abstract])