In [None]:
import time
import urllib.request
import urllib.error
import datetime
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import pandas as pd
import bibtexparser

# Set pandas' chained-assignment check to 'warn' for this notebook session.
pd.set_option('mode.chained_assignment','warn')

In [None]:
# The harvest() function uses arxiv.org's OAI-PMH API to pull metadata for the specified date range.
# code mostly borrowed from http://betatim.github.io/posts/analysing-the-arxiv/

# XML namespace prefixes (ElementTree "{uri}tag" form) for the OAI-PMH envelope
# and for the arXiv metadata payload nested inside each record.
OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"


def _parse_record(record):
    """Convert one OAI ``<record>`` element into a row dict, or None to skip it.

    A record is skipped (None is returned) when it has no arXiv metadata
    (e.g. a header-only record) or when any of title, id, abstract, created
    or categories is missing or ``created`` is not in ``YYYY-MM-DD`` form.
    """
    meta = record.find(OAI + "metadata")
    info = meta.find(ARXIV + "arXiv") if meta is not None else None
    if info is None:
        return None

    def text_of(tag):
        node = info.find(ARXIV + tag)
        return node.text if node is not None else None

    title = text_of("title")
    paper_id = text_of("id")
    abstract = text_of("abstract")
    created = text_of("created")
    categories = text_of("categories")
    if None in (title, paper_id, abstract, created, categories):
        return None

    try:
        created = datetime.datetime.strptime(created, "%Y-%m-%d")
    except ValueError:
        return None

    # If there is more than one DOI use the first one; often the second one
    # (if it exists at all) refers to an erratum or similar.
    doi_parts = (text_of("doi") or "").split()
    doi = doi_parts[0] if doi_parts else None

    return {'title': title,
            'id': paper_id,
            'abstract': abstract.strip(),
            'created': created,
            'categories': categories.split(),
            'doi': doi,
            }


def harvest(startyear, endyear):
    '''Pull arXiv "math" metadata via OAI-PMH for Jan 1, startyear to Jan 1, endyear.

    Parameters
    ----------
    startyear, endyear : int or str
        Years used to build the ``from=<startyear>-01-01`` and
        ``until=<endyear>-01-01`` query arguments.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record with the columns
        title, abstract, categories, created, id, doi (in that order).
        ``categories`` holds a list of strings, ``created`` a datetime and
        ``doi`` the first listed DOI or None.

    Raises
    ------
    urllib.error.HTTPError
        For any HTTP error other than 503 (503 is retried after the delay
        given by the ``Retry-After`` header, 30 seconds if absent/unparsable).
    RuntimeError
        If a response has no ``ListRecords`` element and is not the OAI
        ``noRecordsMatch`` error (which simply ends the harvest).

    NOTE(review): OAI-PMH ``from``/``until`` select on the record datestamp,
    which may differ from the ``created`` date stored in each row -- confirm
    this matches the intended meaning of the date range.
    '''
    columns = ["title", "abstract", "categories", "created", "id", "doi"]
    base_url = "http://export.arxiv.org/oai2?verb=ListRecords&"
    url = (base_url +
           "from=" + str(startyear) + "-01-01&until=" + str(endyear) + "-01-01&" +
           "metadataPrefix=arXiv&set=math")

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas >= 2.0.
    rows = []
    skipped = 0

    while True:
        print("fetching", url)
        try:
            # The context manager closes the HTTP response even on errors.
            with urllib.request.urlopen(url) as response:
                xml = response.read()
        except urllib.error.HTTPError as e:
            if e.code != 503:
                raise
            try:
                to = int(e.headers.get("retry-after", 30))
            except (TypeError, ValueError):
                # Retry-After was not a plain number of seconds.
                to = 30
            print("Got 503. Retrying after {0:d} seconds.".format(to))
            time.sleep(to)
            continue

        root = ET.fromstring(xml)
        listing = root.find(OAI + 'ListRecords')
        if listing is None:
            error = root.find(OAI + "error")
            if error is not None and error.get("code") == "noRecordsMatch":
                # Nothing in the requested range: return what we have.
                break
            detail = ""
            if error is not None:
                detail = ": {0} ({1})".format(error.get("code"),
                                              (error.text or "").strip())
            raise RuntimeError(
                "arXiv OAI response contained no ListRecords element" + detail)

        for record in listing.findall(OAI + "record"):
            row = _parse_record(record)
            if row is None:
                skipped += 1
            else:
                rows.append(row)

        # The list of articles returned by the API comes in chunks. The
        # presence of a non-empty resumptionToken tells us that there is
        # more to be fetched.
        token = listing.find(OAI + "resumptionToken")
        if token is None or not token.text:
            break
        url = base_url + "resumptionToken=%s" % (token.text)

    if skipped:
        print("Skipped {0:d} record(s) without usable arXiv metadata.".format(skipped))

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Harvest every year from 2007 through 2018 (inclusive) and write one
# CSV file per year into ../data/.

for year in range(2007, 2019):
    yearly_csv = '../data/arxiv_math_{}.csv'.format(year)
    df = harvest(year, year + 1)
    df.to_csv(yearly_csv, index=False)

In [None]:
# Combine the per-year CSV files written above into one frame and export it.
#
# - `id` is read as text: new-style arXiv ids such as "0704.0010" look like
#   numbers, and letting read_csv infer a float dtype would drop their
#   leading/trailing zeros.
# - The frames are joined with a single pd.concat call; DataFrame.append was
#   removed in pandas 2.0 and re-copied the accumulated data on every pass.

yearly_frames = []
for year in range(2007, 2019):
    df = pd.read_csv('../data/arxiv_math_' + str(year) + '.csv', dtype={'id': str})
    yearly_frames.append(df)

arxiv_math = pd.concat(yearly_frames, ignore_index=True)

arxiv_math.to_csv('../data/arxiv_math', index = False)