In [1]:
import urllib.request
import bs4 as bs

import datetime
import dateutil.parser
import time
from pathlib import Path
import re

### Each paper on arXiv is assigned a single primary category, e.g. *cs* (Computer Science), *econ* (Economics), etc. We can retrieve the list of existing categories from the API.

In [2]:
# Local file used to cache the raw ListSets response (see the download cell below).
categories_file = "categories.txt"

In [3]:
# The ListSets response is a short XML document describing the arXiv
# topic categories (called 'sets') that can be used in API queries.
# You can open the address in a browser to inspect it yourself.
# Cache it in categories_file so the API is queried only once.

if not Path(categories_file).is_file():

    xml_query_labels = "http://export.arxiv.org/oai2?verb=ListSets"

    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(xml_query_labels) as response:
        sauce = response.read()

    # Decode the bytes before writing: str(sauce) would store the Python
    # "b'...'" repr (escaped newlines and non-ASCII) instead of the XML itself.
    with open(categories_file, "w", encoding="utf-8") as dump:
        dump.write(sauce.decode("utf-8"))

In [4]:
# Parse the cached raw XML from categories_file and build a dict such as
# {'cs': 'Computer Science', 'econ': 'Economics', ...}.

with open(categories_file) as readfrom:
    raw_xml = readfrom.read()

soup = bs.BeautifulSoup(raw_xml, 'lxml')
sets = soup.find_all("set")

# Each <set> element carries a <setSpec> (the key used in queries)
# and a <setName> (the human-readable label).
categories = {entry.setspec.string: entry.setname.string for entry in sets}

### The matter of categories is actually messier. The primary categorization chosen by an author may be finer, e.g. *physics.astro-ph* (Physics: Astrophysics) instead of just *physics*. On top of that, a paper may be assigned to secondary categories, again of potentially differing granularities. A paper may, for example, have *econ.th* (Economics: Theoretical Economics) as its primary category, *stat* as a secondary category without a finer specification like *stat.me* (Statistics: Methodology), and *q-fin.ec* (Quantitative Finance: Economics) as another secondary category.

### What we have retrieved is just the list of categories that can be used in the API queries. Apparently physics enthusiasts get more options.

In [5]:
# Display the parsed {set spec: set name} mapping.
categories

{'cs': 'Computer Science',
 'econ': 'Economics',
 'eess': 'Electrical Engineering and Systems Science',
 'math': 'Mathematics',
 'physics': 'Physics',
 'physics:astro-ph': 'Astrophysics',
 'physics:cond-mat': 'Condensed Matter',
 'physics:gr-qc': 'General Relativity and Quantum Cosmology',
 'physics:hep-ex': 'High Energy Physics - Experiment',
 'physics:hep-lat': 'High Energy Physics - Lattice',
 'physics:hep-ph': 'High Energy Physics - Phenomenology',
 'physics:hep-th': 'High Energy Physics - Theory',
 'physics:math-ph': 'Mathematical Physics',
 'physics:nlin': 'Nonlinear Sciences',
 'physics:nucl-ex': 'Nuclear Experiment',
 'physics:nucl-th': 'Nuclear Theory',
 'physics:physics': 'Physics (Other)',
 'physics:quant-ph': 'Quantum Physics',
 'q-bio': 'Quantitative Biology',
 'q-fin': 'Quantitative Finance',
 'stat': 'Statistics'}

### Let's exclude *physics* as a single category.

In [16]:
# Keep every set except the umbrella 'physics' entry; its sub-sets
# ('physics:astro-ph', ...) stay in and are queried individually.
cats = {
    spec: label
    for spec, label in categories.items()
    if spec != 'physics'
}

### Say we want to retrieve the metadata of papers in Statistics published in May 2008.

In [21]:
# This function talks to You and tries to dump the xml into a local file
# It takes its sweet time because of poor timeout-handling
# Try e.g. harvest_data("2008-05-01","2008-06-01", 'stat')

def harvest_data(isoday_0, isoday_1, cat='all', by_days=1, overwrite_the_file=False, file_name=None, verbose=True) -> int :
    """Download arXiv OAI-PMH ListRecords metadata and append the raw XML to a local file.

    The date range is walked in consecutive windows of `by_days` days; each
    window is fetched with one ListRecords query and its response is appended
    to the output file.

    Parameters
    ----------
    isoday_0, isoday_1 : str
        First and last day of the range, parsed with dateutil; both are sent
        to the API as the `from` / `until` bounds.
    cat : str
        Set to query (e.g. 'stat', 'physics:hep-ph'); 'all' omits the `set`
        argument from the query.
    by_days : int
        Number of days covered by a single query. Must be at least 1.
    overwrite_the_file : bool
        If the output file already exists: True empties it first, False
        leaves it untouched and returns without downloading anything.
    file_name : str or None
        Output path. When falsy, a descriptive name is built from `cat` and
        the two dates.
    verbose : bool
        Print progress messages.

    Returns
    -------
    int or None
        Total number of <record> elements counted in the downloaded slices
        (counting stops at the first slice that fails twice), or None when
        the file already exists and `overwrite_the_file` is False.

    Raises
    ------
    ValueError
        If `by_days` is smaller than 1, or if the second date precedes the first.
    """
    # A non-positive window makes `date_until` fall before `date_from`, so the
    # loop below would never advance and would query the server forever.
    if by_days < 1:
        raise ValueError("by_days must be at least 1")

    def verboseprint(text):
        if verbose:
            print(text)

    date_0 = dateutil.parser.parse(isoday_0).date()
    date_1 = dateutil.parser.parse(isoday_1).date()
    # A window of N days spans [date_from, date_from + (N - 1) days].
    timestep = datetime.timedelta(days=by_days - 1)

    if date_1 < date_0:
        raise ValueError("Second date must be later than the first")

    if not file_name:
        # create a file with an overly descriptive name
        # (':' is replaced because it is not valid in file names on every platform)
        file = f"arXivMeta_{cat.replace(':','--')}_from_{date_0}_to_{date_1}.txt"
    else:
        file = file_name

    verboseprint(f"Writing to {file}")

    if Path(file).is_file():
        if overwrite_the_file:
            # Opening in 'w' mode is enough to empty the file.
            with open(file, 'w'):
                pass
            verboseprint("The file was overwritten")
        else:
            verboseprint("The file already exists")
            return None
    else:
        verboseprint("The file will be created")

    base_query = "http://export.arxiv.org/oai2?verb=ListRecords"

    def get_data_slice(date_from, date_until) -> int:
        """Fetch one date window, append it to the file; return its record count or -1 on failure."""
        # Build the URL on a single line: no embedded newlines or indentation.
        set_argument = "" if cat == 'all' else f"&set={cat}"
        xml_query = f"{base_query}&from={date_from}&until={date_until}{set_argument}&metadataPrefix=arXiv"

        def from_query_to_file() -> int:
            with urllib.request.urlopen(xml_query) as response:
                sauce = response.read()

            # Count the records on the raw bytes so the parser can detect the encoding.
            # NOTE(review): a slice reporting exactly 1000 records is presumably
            # truncated by the server (OAI-PMH resumptionToken is not followed
            # here) - TODO confirm; use a smaller by_days in that case.
            records = bs.BeautifulSoup(sauce, 'lxml').find_all("record")

            # Write the decoded XML; str(sauce) would store the "b'...'" repr
            # with escaped newlines and escaped non-ASCII characters.
            text = sauce.decode("utf-8", errors="replace")

            with open(file, "a", encoding="utf-8") as dump:  # 'append' mode
                dump.write(text)
                verboseprint(f"Dumped {cat} from {date_from} to {date_until}, {len(records)} records")

            return len(records)

        # Try once; on an HTTP error wait 10 seconds and try a second time.
        for attempt in range(2):
            try:
                return from_query_to_file()
            except urllib.error.HTTPError:
                if attempt == 0:
                    time.sleep(10)

        verboseprint(f"Failed {cat} at date_from={date_from}, date_until={date_until}")
        return -1

    # loop over dates and write data-slices to file
    date_from = date_0
    records = 0

    while date_from <= date_1:

        # Never query past the requested end date.
        date_until = min(date_from + timestep, date_1)

        new_records = get_data_slice(date_from, date_until)

        # -1 signals that the slice failed twice: stop harvesting.
        if new_records == -1:
            break

        records = records + new_records
        date_from = date_until + datetime.timedelta(days=1)

    verboseprint(f"Retrieved {records} records\n")

    return records

### There we go. It seems there was about one paper per day in Statistics in May 2008.

In [8]:
# One 32-day window covers the whole range in a single query.
harvest_data(
    "2008-05-01",
    "2008-06-01",
    cat='stat',
    by_days=32,
    overwrite_the_file=True,
    file_name='test.txt',
)

Writing to test.txt
The file was overwritten
Dumped stat from 2008-05-01 to 2008-06-01, 34 records
Retrieved 34 records



34

### It's slow. The *by_days* argument controls how large the chunks are in which we fetch the data. The bigger it is, the faster we go, but when the number of records maxes out at 1000, it means that some of the data was not retrieved. Each chunk takes about a dozen seconds or more due to time-outs.

### 'Economics' and 'Electrical Engineering and Systems Science' records begin only in 2017. Let's ignore them.

In [9]:
# Count the records of the two youngest categories for a full year,
# using a single window that spans the whole year.
year = 2016
for cat in ('econ', 'eess'):
    print(
        cat,
        f"in year {year}:",
        harvest_data(
            f"{year}-01-01",
            f"{year}-12-31",
            cat,
            by_days=370,
            overwrite_the_file=True,
            file_name='test.txt',
            verbose=False,
        ),
    )

econ in year 2016: 0
eess in year 2016: 0


In [20]:
# Drop the two categories that returned no records above.
# pop() with a default keeps this cell re-runnable: `del` raises KeyError
# the second time the cell is executed against the same kernel state.
cats.pop('econ', None)
cats.pop('eess', None)
cats

{'cs': 'Computer Science',
 'math': 'Mathematics',
 'physics:astro-ph': 'Astrophysics',
 'physics:cond-mat': 'Condensed Matter',
 'physics:gr-qc': 'General Relativity and Quantum Cosmology',
 'physics:hep-ex': 'High Energy Physics - Experiment',
 'physics:hep-lat': 'High Energy Physics - Lattice',
 'physics:hep-ph': 'High Energy Physics - Phenomenology',
 'physics:hep-th': 'High Energy Physics - Theory',
 'physics:math-ph': 'Mathematical Physics',
 'physics:nlin': 'Nonlinear Sciences',
 'physics:nucl-ex': 'Nuclear Experiment',
 'physics:nucl-th': 'Nuclear Theory',
 'physics:physics': 'Physics (Other)',
 'physics:quant-ph': 'Quantum Physics',
 'q-bio': 'Quantitative Biology',
 'q-fin': 'Quantitative Finance',
 'stat': 'Statistics'}

### Let's download the metadata one year at a time, one category at a time.

In [22]:
year = '2013'

# One output file per category for the whole year, fetched in 8-day windows.
for cat in cats:
    harvest_data(
        isoday_0=f"{year}-01-01",
        isoday_1=f"{year}-12-31",
        cat=cat,
        by_days=8,
    )

Writing to arXivMeta_cs_from_2013-01-01_to_2013-12-31.txt
The file already exists
Writing to arXivMeta_math_from_2013-01-01_to_2013-12-31.txt
The file already exists
Writing to arXivMeta_physics--astro-ph_from_2013-01-01_to_2013-12-31.txt
The file will be created
Dumped physics:astro-ph from 2013-01-01 to 2013-01-08, 106 records
Dumped physics:astro-ph from 2013-01-09 to 2013-01-16, 152 records
Dumped physics:astro-ph from 2013-01-17 to 2013-01-24, 139 records
Dumped physics:astro-ph from 2013-01-25 to 2013-02-01, 123 records
Dumped physics:astro-ph from 2013-02-02 to 2013-02-09, 107 records
Dumped physics:astro-ph from 2013-02-10 to 2013-02-17, 101 records
Dumped physics:astro-ph from 2013-02-18 to 2013-02-25, 121 records
Dumped physics:astro-ph from 2013-02-26 to 2013-03-05, 152 records
Dumped physics:astro-ph from 2013-03-06 to 2013-03-13, 170 records
Dumped physics:astro-ph from 2013-03-14 to 2013-03-21, 149 records
Dumped physics:astro-ph from 2013-03-22 to 2013-03-29, 112 records

Dumped physics:gr-qc from 2013-07-20 to 2013-07-27, 45 records
Dumped physics:gr-qc from 2013-07-28 to 2013-08-04, 39 records
Dumped physics:gr-qc from 2013-08-05 to 2013-08-12, 77 records
Dumped physics:gr-qc from 2013-08-13 to 2013-08-20, 56 records
Dumped physics:gr-qc from 2013-08-21 to 2013-08-28, 82 records
Dumped physics:gr-qc from 2013-08-29 to 2013-09-05, 62 records
Dumped physics:gr-qc from 2013-09-06 to 2013-09-13, 79 records
Dumped physics:gr-qc from 2013-09-14 to 2013-09-21, 63 records
Dumped physics:gr-qc from 2013-09-22 to 2013-09-29, 51 records
Dumped physics:gr-qc from 2013-09-30 to 2013-10-07, 69 records
Dumped physics:gr-qc from 2013-10-08 to 2013-10-15, 66 records
Dumped physics:gr-qc from 2013-10-16 to 2013-10-23, 49 records
Dumped physics:gr-qc from 2013-10-24 to 2013-10-31, 99 records
Dumped physics:gr-qc from 2013-11-01 to 2013-11-08, 68 records
Dumped physics:gr-qc from 2013-11-09 to 2013-11-16, 86 records
Dumped physics:gr-qc from 2013-11-17 to 2013-11-24, 75 

Dumped physics:hep-ph from 2013-03-22 to 2013-03-29, 111 records
Dumped physics:hep-ph from 2013-03-30 to 2013-04-06, 83 records
Dumped physics:hep-ph from 2013-04-07 to 2013-04-14, 90 records
Dumped physics:hep-ph from 2013-04-15 to 2013-04-22, 97 records
Dumped physics:hep-ph from 2013-04-23 to 2013-04-30, 90 records
Dumped physics:hep-ph from 2013-05-01 to 2013-05-08, 109 records
Dumped physics:hep-ph from 2013-05-09 to 2013-05-16, 100 records
Dumped physics:hep-ph from 2013-05-17 to 2013-05-24, 101 records
Dumped physics:hep-ph from 2013-05-25 to 2013-06-01, 1000 records
Dumped physics:hep-ph from 2013-06-02 to 2013-06-09, 81 records
Dumped physics:hep-ph from 2013-06-10 to 2013-06-17, 96 records
Dumped physics:hep-ph from 2013-06-18 to 2013-06-25, 91 records
Dumped physics:hep-ph from 2013-06-26 to 2013-07-03, 136 records
Dumped physics:hep-ph from 2013-07-04 to 2013-07-11, 90 records
Dumped physics:hep-ph from 2013-07-12 to 2013-07-19, 84 records
Dumped physics:hep-ph from 2013-0

In [23]:
year = '2014'

# One output file per category for the whole year, fetched in 7-day windows.
for cat in cats:
    harvest_data(
        isoday_0=f"{year}-01-01",
        isoday_1=f"{year}-12-31",
        cat=cat,
        by_days=7,
    )

Writing to arXivMeta_cs_from_2014-01-01_to_2014-12-31.txt
The file will be created
Failed cs at date_from=2014-01-01, date_until=2014-01-07
Retrieved 0 records

Writing to arXivMeta_math_from_2014-01-01_to_2014-12-31.txt
The file will be created
Failed math at date_from=2014-01-01, date_until=2014-01-07
Retrieved 0 records

Writing to arXivMeta_physics--astro-ph_from_2014-01-01_to_2014-12-31.txt
The file will be created
Failed physics:astro-ph at date_from=2014-01-01, date_until=2014-01-07
Retrieved 0 records

Writing to arXivMeta_physics--cond-mat_from_2014-01-01_to_2014-12-31.txt
The file will be created
Failed physics:cond-mat at date_from=2014-01-01, date_until=2014-01-07
Retrieved 0 records

Writing to arXivMeta_physics--gr-qc_from_2014-01-01_to_2014-12-31.txt
The file will be created
Failed physics:gr-qc at date_from=2014-01-01, date_until=2014-01-07
Retrieved 0 records

Writing to arXivMeta_physics--hep-ex_from_2014-01-01_to_2014-12-31.txt
The file will be created
Failed physics

KeyboardInterrupt: 

In [None]:
year = '2015'

# One output file per category for the whole year, fetched in 6-day windows.
for cat in cats:
    harvest_data(
        isoday_0=f"{year}-01-01",
        isoday_1=f"{year}-12-31",
        cat=cat,
        by_days=6,
    )

In [None]:
year = '2016'

# One output file per category for the whole year, fetched in 5-day windows.
for cat in cats:
    harvest_data(
        isoday_0=f"{year}-01-01",
        isoday_1=f"{year}-12-31",
        cat=cat,
        by_days=5,
    )

In [None]:
year = '2017'

# One output file per category for the whole year, fetched in 4-day windows.
for cat in cats:
    harvest_data(
        isoday_0=f"{year}-01-01",
        isoday_1=f"{year}-12-31",
        cat=cat,
        by_days=4,
    )