In [1]:
import urllib.request
import bs4 as bs
import time
import datetime
import dateutil.parser
import csv
import re
import os
from pathlib import Path

---

# arXiv metadata harvester
arXiv is an online repository of scientific pre-prints, see https://arxiv.org/help/general
## Grab records from the requested timespan, from all or from one selected category

## Write to tab-delimited local csv:
## columns: *id, primary_category, sub_categories, title, abstract*
(Dealing with funny characters in the names of authors was beyond me. One could also get a date associated with each record but it's supposed not to necessarily correspond to the date of posting by the authors.)
### There are two functions; both will talk to You using prints:
* *harvest_slice* needs You to explicitly choose the category (possibly 'all') and the filename as arguments
    * just appends lines to the file, it's up to You not to make a mess
    
* *harvest_data* divides the timespan into slices of given length (367 days by default) and harvests those using *harvest_slice*:
    * can make up the name of the file on its own
    * adds the header to the csv
    * default behavior when the file already exists is to quit
    * default category is 'all'

### It is slow.    
### Examples:
*  1 min,  2 MB >>> harvest_slice("2018-10-01", "2018-10-10", "math", "test.csv")
*  4 min, 11 MB >>> harvest_slice("2018-10-01", "2018-10-10", "all", "test.csv")
* 8 min, 16 MB >>> harvest_data("2018-08-01", "2018-11-01", category="math", file_name = "test.csv", overwrite=True)
* 30 min, 68 MB >>> harvest_data("2018-08-01", "2018-11-01")

### Example of a basic query used in the code:
* http://export.arxiv.org/oai2?verb=ListRecords&from=2012-01-01&until=2018-02-01&set=physics:hep-th&metadataPrefix=arXiv
* "http://export.arxiv.org/oai2?verb=ListSets"

See https://arxiv.org/help/bulk_data for more info.

---

### Each paper on arXive has assigned to it a single primary category, e.g. *cs* (Computer Science), *econ* (Economics), etc. We can retrieve the list of existing categories from the API.

In [2]:
# The query retrieves short xml meta-metadata containing info
# about the accesible arXiv categories of topics called 'sets', e.g.
# <set>
# <setspec>cs</setspec>
# <setname>Computer Science</setname>
# </set>


if not Path("categories.txt").is_file() :
    
    xml_query = "http://export.arxiv.org/oai2?verb=ListSets"
    sauce = urllib.request.urlopen(xml_query).read()
    soup = bs.BeautifulSoup(sauce, 'lxml')
    sets = soup.find_all("set")

    categories = {}

    for set_ in sets:
        categories[set_.setspec.string] = set_.setname.string

    with open("categories.txt", "w") as file:
        for category, description in categories.items():
            file.write("".join([category,"\t" , description, "\n"]))
            

categories = {}
with open("categories.txt", "r") as file:
    for line in file:
        category, description = line.rstrip('\n').split('\t')
        categories[category] = description

categories

{'cs': 'Computer Science',
 'econ': 'Economics',
 'eess': 'Electrical Engineering and Systems Science',
 'math': 'Mathematics',
 'physics': 'Physics',
 'physics:astro-ph': 'Astrophysics',
 'physics:cond-mat': 'Condensed Matter',
 'physics:gr-qc': 'General Relativity and Quantum Cosmology',
 'physics:hep-ex': 'High Energy Physics - Experiment',
 'physics:hep-lat': 'High Energy Physics - Lattice',
 'physics:hep-ph': 'High Energy Physics - Phenomenology',
 'physics:hep-th': 'High Energy Physics - Theory',
 'physics:math-ph': 'Mathematical Physics',
 'physics:nlin': 'Nonlinear Sciences',
 'physics:nucl-ex': 'Nuclear Experiment',
 'physics:nucl-th': 'Nuclear Theory',
 'physics:physics': 'Physics (Other)',
 'physics:quant-ph': 'Quantum Physics',
 'q-bio': 'Quantitative Biology',
 'q-fin': 'Quantitative Finance',
 'stat': 'Statistics'}

### The matter of categories is actually more messy. Physics gets an additional level of gradation: e.g. *physics:astro-ph* is a subset of *physics*. And in general categorizations chosen by an author may be finer, e.g. *cs.ai* (Computer Science: Artificial Intelligence) instead of just *cs*, *physics:astro-ph.GA* (Physics: Astrophysics: Astrophysics of Galaxies) instead of just *physics*.

### What You see above are just names of *sets* that can be used in the API queries. Apparently physics enthusiasts get more options. Also *Economics* and *Electrical Engineering* start only in 2017 (I've checked).

### Lets treat *physics* as a single category for consistency, and let's exclude *econ* and *eess*.

In [3]:
cats = {cat: cat_name for (cat, cat_name) in categories.items() if cat not in  ['econ' ,'eess'] and not re.match(r'physics:.+', cat)}
cats

{'cs': 'Computer Science',
 'math': 'Mathematics',
 'physics': 'Physics',
 'q-bio': 'Quantitative Biology',
 'q-fin': 'Quantitative Finance',
 'stat': 'Statistics'}

---

In [3]:
def harvest_slice(date_from, date_until, category, file) -> int:
    # returns -1 if unsuccesful
    # returns number of downloaded records if succesful
    
    base_query = "http://export.arxiv.org/oai2?verb=ListRecords"
    
    if category == "all":
        query = base_query + f"&from={date_from}&until={date_until}&metadataPrefix=arXiv"
    else:
        query = base_query + f"&from={date_from}&until={date_until}&set={category}&metadataPrefix=arXiv"
    
    retrieved = 0
    
    while query:
        
        time_0 = time.time()
        
        # try to download
        try:            
            sauce = urllib.request.urlopen(query).read()

        except:
            print(f"Failed at: {category} from {date_from} until {date_until} requesting {query}\n")
            return -1
        
        # parse the xml looking for <record>'s
        soup = bs.BeautifulSoup(sauce, 'lxml')
        records = soup.find_all('record')

        retrieved = retrieved + len(records)

        with open(file, "a") as dump:

            writer = csv.writer(dump, delimiter='\t')
            for record in records:                
                record_string = [record.id.string,
                                 record.setspec.string,
                                 record.categories.string,
                                 record.title.string,
                                 record.abstract.string
                                ]
                writer.writerow(record_string)
        
        if len(records) == 0:
            print("".join([category," from ", f"{date_from}"," until ", f"{date_until}"," empty"]))
            break
        
        # info at the end of 'soup' about where to resume if the data stream was cut at 1000 records
        # None if the stream wasn't cut
        res_token = soup.find("resumptiontoken")
        
        if res_token:

            # data in the current loop started at this record in the 'query'
            started_at = int(res_token['cursor']) + 1
            
            # total number of records in the 'query', should be the same in each loop
            all_to_retrieve = int(res_token['completelistsize'])
            
            if res_token.string:
                # the identifier that allows to resume the query
                # None if the slice was completed

                query = base_query + f"&resumptionToken={res_token.string}"
                time.sleep(10)
            else:
                query = None
            
        else:
            started_at = 1 
            all_to_retrieve = len(records)
            query = None
        
        time_1 = time.time()
        
        print("".join([category,
                       " from ", f"{date_from}", " until ", f"{date_until}",
                       f" ({started_at:>5}-{started_at+len(records)-1:>5})/{all_to_retrieve:>5}",
                      " in ", f"{(time_1 - time_0):3.2f}", "s"]) )
    
    # end of while loop
     
    return retrieved


In [5]:
harvest_slice("2018-10-01", "2018-10-10", "math", "test.csv")

math from 2018-10-01 until 2018-10-10 (    1- 1000)/ 2358 in 22.93s
math from 2018-10-01 until 2018-10-10 ( 1001- 2000)/ 2358 in 25.32s
math from 2018-10-01 until 2018-10-10 ( 2001- 2358)/ 2358 in 5.24s


2358

In [34]:
harvest_slice("2018-10-01", "2018-10-10", "all", "test.csv")

all from 2018-10-01 until 2018-10-10 (    1- 1000)/ 7663 in 32.15s
all from 2018-10-01 until 2018-10-10 ( 1001- 2000)/ 7663 in 25.72s
all from 2018-10-01 until 2018-10-10 ( 2001- 3000)/ 7663 in 26.29s
all from 2018-10-01 until 2018-10-10 ( 3001- 4000)/ 7663 in 25.64s
all from 2018-10-01 until 2018-10-10 ( 4001- 5000)/ 7663 in 24.63s
all from 2018-10-01 until 2018-10-10 ( 5001- 6000)/ 7663 in 24.01s
all from 2018-10-01 until 2018-10-10 ( 6001- 7000)/ 7663 in 25.05s
all from 2018-10-01 until 2018-10-10 ( 7001- 7663)/ 7663 in 10.29s


7663

In [5]:
# Wrapper around harvest_slice
# * handles file-names
# * slices the time period of papers into intervals of given number of days (367 by default)
# (to avoid maxing-out the response from server)

def harvest_data(isoday_0, isoday_1, category='all', days_in_slice = 367, file_name=None, overwrite=False) -> int:

    date_0 = dateutil.parser.parse(isoday_0).date()
    date_1 = dateutil.parser.parse(isoday_1).date()

    if not file_name:
        # create a file with an overly descriptive name
        file = f"arXivMeta_{category.replace(':','--')}_from_{date_0}_to_{date_1}.csv"
    else:
        file = file_name
    
    # check if file already exists
    if Path(file).is_file():
        if overwrite :

            # try to backup the old file
            file_info = re.match(r"(\w.+)\.(\w\w+)", file)
            if file_info:
                new_file = "".join([ file_info.group(1), "_bak.", file_info.group(2) ])
                if not Path(new_file).is_file():
                    os.rename(file, new_file)
                    print(f"Old file backed up as {new_file}")

            # clear the file
            print(f"Overwriting {file}\n")
            with open(file, "w") as dump:
                dump.truncate(0)
            
        else:
            print(f"The file {file} already exists")
            return -1
    
    else:
        print(f"Writing to {file}\n")
    
    with open(file, "a") as dump:
            writer = csv.writer(dump, delimiter='\t')
            header = ['id', 'prim_cat', 'sec_cats', 'title', 'abstract']
            writer.writerow(header)
    
    # Star the clock
    time_0 = time.time()
    
    # Let's count all downloaded records
    retrieved = 0
    
    # We'll go from 'date_0' until 'date_1' in slices of 'days_in_slice' days
    # The server's response presumably maxes out at some number of records,
    # so we hope to have slices with less records than that.

    date_from = date_0

    while date_from <= date_1:
        
        date_until = min(date_1, date_from + datetime.timedelta(days_in_slice-1))

        # try to download the slice
        newly_retrieved = harvest_slice(date_from, date_until, category, file)
        
        if newly_retrieved == -1:
            break
        
        retrieved = retrieved + newly_retrieved

        # move on to the next slice
        date_from = date_until + datetime.timedelta(days=1)
        
        # time-out
        time.sleep(10)

    time_1 = time.time()
    
    print("".join([category,
                   " from ", str(date_0), " until ", str(date_1),
                   " retrieved ", str(retrieved), " records"
                   ," in ", f"{(time_1 - time_0)/60:.0f}", " min\n"])
         )
    
    return retrieved


In [140]:
harvest_data("2018-08-01", "2018-11-01", category="math", file_name = "test.csv", overwrite=True)

Writing to test.csv

math from 2018-08-01 until 2018-11-01 (    1- 1000)/18739 in 26.22s
math from 2018-08-01 until 2018-11-01 ( 1001- 2000)/18739 in 26.94s
math from 2018-08-01 until 2018-11-01 ( 2001- 3000)/18739 in 27.15s
math from 2018-08-01 until 2018-11-01 ( 3001- 4000)/18739 in 24.76s
math from 2018-08-01 until 2018-11-01 ( 4001- 5000)/18739 in 26.61s
math from 2018-08-01 until 2018-11-01 ( 5001- 6000)/18739 in 24.70s
math from 2018-08-01 until 2018-11-01 ( 6001- 7000)/18739 in 24.08s
math from 2018-08-01 until 2018-11-01 ( 7001- 8000)/18739 in 26.17s
math from 2018-08-01 until 2018-11-01 ( 8001- 9000)/18739 in 23.18s
math from 2018-08-01 until 2018-11-01 ( 9001-10000)/18739 in 24.53s
math from 2018-08-01 until 2018-11-01 (10001-11000)/18739 in 24.96s
math from 2018-08-01 until 2018-11-01 (11001-12000)/18739 in 23.93s
math from 2018-08-01 until 2018-11-01 (12001-13000)/18739 in 23.57s
math from 2018-08-01 until 2018-11-01 (13001-14000)/18739 in 22.86s
math from 2018-08-01 until 

18739

In [142]:
harvest_data("2018-08-01", "2018-11-01")

Writing to arXivMeta_all_from_2018-08-01_to_2018-11-01.csv

all from 2018-08-01 until 2018-11-01 (    1- 1000)/61276 in 26.90s
all from 2018-08-01 until 2018-11-01 ( 1001- 2000)/61276 in 27.15s
all from 2018-08-01 until 2018-11-01 ( 2001- 3000)/61276 in 27.79s
all from 2018-08-01 until 2018-11-01 ( 3001- 4000)/61276 in 28.60s
all from 2018-08-01 until 2018-11-01 ( 4001- 5000)/61276 in 41.76s
all from 2018-08-01 until 2018-11-01 ( 5001- 6000)/61276 in 28.32s
all from 2018-08-01 until 2018-11-01 ( 6001- 7000)/61276 in 26.41s
all from 2018-08-01 until 2018-11-01 ( 7001- 8000)/61276 in 27.18s
all from 2018-08-01 until 2018-11-01 ( 8001- 9000)/61276 in 30.19s
all from 2018-08-01 until 2018-11-01 ( 9001-10000)/61276 in 29.14s
all from 2018-08-01 until 2018-11-01 (10001-11000)/61276 in 30.14s
all from 2018-08-01 until 2018-11-01 (11001-12000)/61276 in 27.50s
all from 2018-08-01 until 2018-11-01 (12001-13000)/61276 in 26.80s
all from 2018-08-01 until 2018-11-01 (13001-14000)/61276 in 72.56s
al

61276

In [156]:
cats

{'cs': 'Computer Science',
 'math': 'Mathematics',
 'physics': 'Physics',
 'q-bio': 'Quantitative Biology',
 'q-fin': 'Quantitative Finance',
 'stat': 'Statistics'}

In [5]:
year = 2017
cat = 'stat'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_stat_from_2017-01-01_to_2017-12-31.csv

stat from 2017-01-01 until 2017-12-31 (    1- 1000)/ 7533 in 61.89s
stat from 2017-01-01 until 2017-12-31 ( 1001- 2000)/ 7533 in 168.69s
stat from 2017-01-01 until 2017-12-31 ( 2001- 3000)/ 7533 in 52.54s
stat from 2017-01-01 until 2017-12-31 ( 3001- 4000)/ 7533 in 75.94s
stat from 2017-01-01 until 2017-12-31 ( 4001- 5000)/ 7533 in 96.69s
stat from 2017-01-01 until 2017-12-31 ( 5001- 6000)/ 7533 in 119.96s
stat from 2017-01-01 until 2017-12-31 ( 6001- 7000)/ 7533 in 253.64s
stat from 2017-01-01 until 2017-12-31 ( 7001- 7533)/ 7533 in 8.77s
stat from 2017-01-01 until 2017-12-31 retrieved 7533 records in 14 min



7533

In [7]:
year = 2016
cat = 'stat'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_stat_from_2016-01-01_to_2016-12-31.csv

stat from 2016-01-01 until 2016-12-31 (    1- 1000)/ 5653 in 37.66s
stat from 2016-01-01 until 2016-12-31 ( 1001- 2000)/ 5653 in 26.14s
stat from 2016-01-01 until 2016-12-31 ( 2001- 3000)/ 5653 in 24.50s
stat from 2016-01-01 until 2016-12-31 ( 3001- 4000)/ 5653 in 24.20s
stat from 2016-01-01 until 2016-12-31 ( 4001- 5000)/ 5653 in 24.21s
stat from 2016-01-01 until 2016-12-31 ( 5001- 5653)/ 5653 in 10.78s
stat from 2016-01-01 until 2016-12-31 retrieved 5653 records in 3 min


5653

In [158]:
year = 2015
cat = 'stat'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_stat_from_2015-01-01_to_2015-12-31.csv

stat from 2015-01-01 until 2015-12-31 (    1- 1000)/ 4847 in 27.61s
stat from 2015-01-01 until 2015-12-31 ( 1001- 2000)/ 4847 in 27.32s
stat from 2015-01-01 until 2015-12-31 ( 2001- 3000)/ 4847 in 25.01s
stat from 2015-01-01 until 2015-12-31 ( 3001- 4000)/ 4847 in 25.59s
stat from 2015-01-01 until 2015-12-31 ( 4001- 4847)/ 4847 in 13.94s
stat from 2015-01-01 until 2015-12-31 retrieved 4847 records in 2 min


4847

In [168]:
year0 = 2012
year1 = 2014
cat = 'stat'
harvest_data(f"{year0}-01-01", f"{year1}-12-31", category=cat, overwrite=True)

Old file backed up as arXivMeta_stat_from_2012-01-01_to_2014-12-31_bak.csv
Overwriting arXivMeta_stat_from_2012-01-01_to_2014-12-31.csv

Failed at: stat from 2012-01-01 until 2012-12-31 requesting http://export.arxiv.org/oai2?verb=ListRecords&from=2012-01-01&until=2012-12-31&set=stat&metadataPrefix=arXiv

stat from 2012-01-01 until 2014-12-31 retrieved 0 records in 0 min


0

In [6]:
year = 2017
cat = 'q-fin'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_q-fin_from_2017-01-01_to_2017-12-31.csv

q-fin from 2017-01-01 until 2017-12-31 (    1- 1000)/ 1001 in 23.46s
q-fin from 2017-01-01 until 2017-12-31 ( 1001- 1001)/ 1001 in 1.74s
q-fin from 2017-01-01 until 2017-12-31 retrieved 1001 records in 1 min



1001

In [8]:
year = 2016
cat = 'q-fin'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_q-fin_from_2016-01-01_to_2016-12-31.csv

q-fin from 2016-01-01 until 2016-12-31 (    1- 1000)/ 1011 in 23.54s
q-fin from 2016-01-01 until 2016-12-31 ( 1001- 1011)/ 1011 in 2.07s
q-fin from 2016-01-01 until 2016-12-31 retrieved 1011 records in 1 min


1011

In [159]:
year = 2015
cat = 'q-fin'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_q-fin_from_2015-01-01_to_2015-12-31.csv

q-fin from 2015-01-01 until 2015-12-31 (    1- 1000)/ 1118 in 24.65s
q-fin from 2015-01-01 until 2015-12-31 ( 1001- 1118)/ 1118 in 3.77s
q-fin from 2015-01-01 until 2015-12-31 retrieved 1118 records in 1 min


1118

In [169]:
year0 = 2012
year1 = 2014
cat = 'q-fin'
harvest_data(f"{year0}-01-01", f"{year1}-12-31", category=cat, overwrite=True)

Old file backed up as arXivMeta_q-fin_from_2012-01-01_to_2014-12-31_bak.csv
Overwriting arXivMeta_q-fin_from_2012-01-01_to_2014-12-31.csv

Failed at: q-fin from 2012-01-01 until 2012-12-31 requesting http://export.arxiv.org/oai2?verb=ListRecords&from=2012-01-01&until=2012-12-31&set=q-fin&metadataPrefix=arXiv

q-fin from 2012-01-01 until 2014-12-31 retrieved 0 records in 0 min


0

In [7]:
year = 2017
cat = 'q-bio'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_q-bio_from_2017-01-01_to_2017-12-31.csv

q-bio from 2017-01-01 until 2017-12-31 (    1- 1000)/ 2745 in 27.95s
q-bio from 2017-01-01 until 2017-12-31 ( 1001- 2000)/ 2745 in 30.29s
q-bio from 2017-01-01 until 2017-12-31 ( 2001- 2745)/ 2745 in 45.23s
q-bio from 2017-01-01 until 2017-12-31 retrieved 2745 records in 2 min



2745

In [9]:
year = 2016
cat = 'q-bio'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_q-bio_from_2016-01-01_to_2016-12-31.csv

q-bio from 2016-01-01 until 2016-12-31 (    1- 1000)/ 2848 in 24.75s
q-bio from 2016-01-01 until 2016-12-31 ( 1001- 2000)/ 2848 in 25.89s
q-bio from 2016-01-01 until 2016-12-31 ( 2001- 2848)/ 2848 in 12.75s
q-bio from 2016-01-01 until 2016-12-31 retrieved 2848 records in 1 min


2848

In [160]:
year = 2015
cat = 'q-bio'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_q-bio_from_2015-01-01_to_2015-12-31.csv

q-bio from 2015-01-01 until 2015-12-31 (    1- 1000)/ 3633 in 27.92s
q-bio from 2015-01-01 until 2015-12-31 ( 1001- 2000)/ 3633 in 27.71s
q-bio from 2015-01-01 until 2015-12-31 ( 2001- 3000)/ 3633 in 25.48s
q-bio from 2015-01-01 until 2015-12-31 ( 3001- 3633)/ 3633 in 11.02s
q-bio from 2015-01-01 until 2015-12-31 retrieved 3633 records in 2 min


3633

In [166]:
year0 = 2012
year1 = 2014
cat = 'q-bio'
harvest_data(f"{year0}-01-01", f"{year1}-12-31", category=cat)

Writing to arXivMeta_q-bio_from_2012-01-01_to_2014-12-31.csv

Failed at: q-bio from 2012-01-01 until 2012-12-31 requesting http://export.arxiv.org/oai2?verb=ListRecords&from=2012-01-01&until=2012-12-31&set=q-bio&metadataPrefix=arXiv

q-bio from 2012-01-01 until 2014-12-31 retrieved 0 records in 0 min


0

In [8]:
year = 2017
cat = 'cs'
harvest_data(f"{year}-01-01", f"{year}-07-01", category=cat)
harvest_data(f"{year}-07-02", f"{year}-12-31", category=cat)

Writing to arXivMeta_cs_from_2017-01-01_to_2017-07-01.csv

cs from 2017-01-01 until 2017-07-01 (    1- 1000)/14336 in 65.01s
cs from 2017-01-01 until 2017-07-01 ( 1001- 2000)/14336 in 59.40s
cs from 2017-01-01 until 2017-07-01 ( 2001- 3000)/14336 in 50.41s
cs from 2017-01-01 until 2017-07-01 ( 3001- 4000)/14336 in 64.74s
cs from 2017-01-01 until 2017-07-01 ( 4001- 5000)/14336 in 81.46s
cs from 2017-01-01 until 2017-07-01 ( 5001- 6000)/14336 in 67.16s
cs from 2017-01-01 until 2017-07-01 ( 6001- 7000)/14336 in 52.28s
cs from 2017-01-01 until 2017-07-01 ( 7001- 8000)/14336 in 67.22s
cs from 2017-01-01 until 2017-07-01 ( 8001- 9000)/14336 in 97.56s
cs from 2017-01-01 until 2017-07-01 ( 9001-10000)/14336 in 80.68s
cs from 2017-01-01 until 2017-07-01 (10001-11000)/14336 in 139.89s
cs from 2017-01-01 until 2017-07-01 (11001-12000)/14336 in 67.61s
cs from 2017-01-01 until 2017-07-01 (12001-13000)/14336 in 60.01s
cs from 2017-01-01 until 2017-07-01 (13001-14000)/14336 in 25.18s
cs from 2017-01-

16447

In [10]:
year = 2016
cat = 'cs'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_cs_from_2016-01-01_to_2016-12-31.csv

cs from 2016-01-01 until 2016-12-31 (    1- 1000)/26365 in 26.32s
cs from 2016-01-01 until 2016-12-31 ( 1001- 2000)/26365 in 26.68s
cs from 2016-01-01 until 2016-12-31 ( 2001- 3000)/26365 in 28.45s
cs from 2016-01-01 until 2016-12-31 ( 3001- 4000)/26365 in 26.20s
cs from 2016-01-01 until 2016-12-31 ( 4001- 5000)/26365 in 24.70s
cs from 2016-01-01 until 2016-12-31 ( 5001- 6000)/26365 in 24.03s
cs from 2016-01-01 until 2016-12-31 ( 6001- 7000)/26365 in 23.86s
cs from 2016-01-01 until 2016-12-31 ( 7001- 8000)/26365 in 23.83s
cs from 2016-01-01 until 2016-12-31 ( 8001- 9000)/26365 in 23.12s
cs from 2016-01-01 until 2016-12-31 ( 9001-10000)/26365 in 26.83s
cs from 2016-01-01 until 2016-12-31 (10001-11000)/26365 in 24.90s
cs from 2016-01-01 until 2016-12-31 (11001-12000)/26365 in 50.23s
cs from 2016-01-01 until 2016-12-31 (12001-13000)/26365 in 24.65s
cs from 2016-01-01 until 2016-12-31 (13001-14000)/26365 in 24.53s
cs from 2016-01-0

26365

In [161]:
year = 2015
cat = 'cs'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_cs_from_2015-01-01_to_2015-12-31.csv

cs from 2015-01-01 until 2015-12-31 (    1- 1000)/20442 in 28.37s
cs from 2015-01-01 until 2015-12-31 ( 1001- 2000)/20442 in 27.26s
cs from 2015-01-01 until 2015-12-31 ( 2001- 3000)/20442 in 26.66s
cs from 2015-01-01 until 2015-12-31 ( 3001- 4000)/20442 in 33.14s
cs from 2015-01-01 until 2015-12-31 ( 4001- 5000)/20442 in 27.91s
cs from 2015-01-01 until 2015-12-31 ( 5001- 6000)/20442 in 25.80s
cs from 2015-01-01 until 2015-12-31 ( 6001- 7000)/20442 in 26.08s
cs from 2015-01-01 until 2015-12-31 ( 7001- 8000)/20442 in 24.76s
cs from 2015-01-01 until 2015-12-31 ( 8001- 9000)/20442 in 25.76s
cs from 2015-01-01 until 2015-12-31 ( 9001-10000)/20442 in 26.13s
cs from 2015-01-01 until 2015-12-31 (10001-11000)/20442 in 25.30s
cs from 2015-01-01 until 2015-12-31 (11001-12000)/20442 in 25.18s
cs from 2015-01-01 until 2015-12-31 (12001-13000)/20442 in 24.68s
cs from 2015-01-01 until 2015-12-31 (13001-14000)/20442 in 24.82s
cs from 2015-01-0

20442

In [9]:
year = 2017
cat = 'math'
harvest_data(f"{year}-01-01", f"{year}-05-01", category=cat)
harvest_data(f"{year}-05-02", f"{year}-09-01", category=cat)
harvest_data(f"{year}-09-02", f"{year}-12-31", category=cat)

Writing to arXivMeta_math_from_2017-01-01_to_2017-05-01.csv

math from 2017-01-01 until 2017-05-01 (    1- 1000)/14122 in 51.04s
math from 2017-01-01 until 2017-05-01 ( 1001- 2000)/14122 in 167.79s
math from 2017-01-01 until 2017-05-01 ( 2001- 3000)/14122 in 169.52s
math from 2017-01-01 until 2017-05-01 ( 3001- 4000)/14122 in 117.55s
math from 2017-01-01 until 2017-05-01 ( 4001- 5000)/14122 in 116.51s
math from 2017-01-01 until 2017-05-01 ( 5001- 6000)/14122 in 24.90s
math from 2017-01-01 until 2017-05-01 ( 6001- 7000)/14122 in 29.22s
math from 2017-01-01 until 2017-05-01 ( 7001- 8000)/14122 in 26.21s
math from 2017-01-01 until 2017-05-01 ( 8001- 9000)/14122 in 24.00s
math from 2017-01-01 until 2017-05-01 ( 9001-10000)/14122 in 23.72s
math from 2017-01-01 until 2017-05-01 (10001-11000)/14122 in 25.60s
math from 2017-01-01 until 2017-05-01 (11001-12000)/14122 in 26.83s
math from 2017-01-01 until 2017-05-01 (12001-13000)/14122 in 23.24s
math from 2017-01-01 until 2017-05-01 (13001-14000)

15922

In [11]:
year = 2016
cat = 'math'
harvest_data(f"{year}-01-01", f"{year}-07-01", category=cat)
harvest_data(f"{year}-07-02", f"{year}-12-31", category=cat)

Writing to arXivMeta_math_from_2016-01-01_to_2016-07-01.csv

math from 2016-01-01 until 2016-07-01 (    1- 1000)/19710 in 115.82s
math from 2016-01-01 until 2016-07-01 ( 1001- 2000)/19710 in 28.05s
math from 2016-01-01 until 2016-07-01 ( 2001- 3000)/19710 in 26.58s
math from 2016-01-01 until 2016-07-01 ( 3001- 4000)/19710 in 30.06s
math from 2016-01-01 until 2016-07-01 ( 4001- 5000)/19710 in 29.94s
math from 2016-01-01 until 2016-07-01 ( 5001- 6000)/19710 in 30.71s
math from 2016-01-01 until 2016-07-01 ( 6001- 7000)/19710 in 24.87s
math from 2016-01-01 until 2016-07-01 ( 7001- 8000)/19710 in 29.16s
math from 2016-01-01 until 2016-07-01 ( 8001- 9000)/19710 in 24.72s
math from 2016-01-01 until 2016-07-01 ( 9001-10000)/19710 in 61.42s
math from 2016-01-01 until 2016-07-01 (10001-11000)/19710 in 65.89s
math from 2016-01-01 until 2016-07-01 (11001-12000)/19710 in 80.26s
math from 2016-01-01 until 2016-07-01 (12001-13000)/19710 in 46.01s
math from 2016-01-01 until 2016-07-01 (13001-14000)/19

25240

In [162]:
year = 2015
cat = 'math'
harvest_data(f"{year}-01-01", f"{year}-12-31", category=cat)

Writing to arXivMeta_math_from_2015-01-01_to_2015-12-31.csv

math from 2015-01-01 until 2015-12-31 (    1- 1000)/45052 in 27.74s
math from 2015-01-01 until 2015-12-31 ( 1001- 2000)/45052 in 25.26s
math from 2015-01-01 until 2015-12-31 ( 2001- 3000)/45052 in 23.18s
math from 2015-01-01 until 2015-12-31 ( 3001- 4000)/45052 in 24.57s
math from 2015-01-01 until 2015-12-31 ( 4001- 5000)/45052 in 25.77s
math from 2015-01-01 until 2015-12-31 ( 5001- 6000)/45052 in 26.72s
math from 2015-01-01 until 2015-12-31 ( 6001- 7000)/45052 in 24.42s
math from 2015-01-01 until 2015-12-31 ( 7001- 8000)/45052 in 25.21s
math from 2015-01-01 until 2015-12-31 ( 8001- 9000)/45052 in 24.86s
math from 2015-01-01 until 2015-12-31 ( 9001-10000)/45052 in 26.47s
math from 2015-01-01 until 2015-12-31 (10001-11000)/45052 in 26.96s
math from 2015-01-01 until 2015-12-31 (11001-12000)/45052 in 26.23s
math from 2015-01-01 until 2015-12-31 (12001-13000)/45052 in 29.02s
math from 2015-01-01 until 2015-12-31 (13001-14000)/450

45052

In [6]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-01-01", f"{year}-05-01", category=cat)

Writing to arXivMeta_physics_from_2017-01-01_to_2017-05-01.csv

physics from 2017-01-01 until 2017-05-01 (    1- 1000)/22316 in 33.18s
physics from 2017-01-01 until 2017-05-01 ( 1001- 2000)/22316 in 32.31s
physics from 2017-01-01 until 2017-05-01 ( 2001- 3000)/22316 in 28.52s
physics from 2017-01-01 until 2017-05-01 ( 3001- 4000)/22316 in 34.51s
physics from 2017-01-01 until 2017-05-01 ( 4001- 5000)/22316 in 28.56s
physics from 2017-01-01 until 2017-05-01 ( 5001- 6000)/22316 in 30.83s
physics from 2017-01-01 until 2017-05-01 ( 6001- 7000)/22316 in 29.97s
physics from 2017-01-01 until 2017-05-01 ( 7001- 8000)/22316 in 97.47s
physics from 2017-01-01 until 2017-05-01 ( 8001- 9000)/22316 in 105.98s
physics from 2017-01-01 until 2017-05-01 ( 9001-10000)/22316 in 90.21s
physics from 2017-01-01 until 2017-05-01 (10001-11000)/22316 in 67.14s
physics from 2017-01-01 until 2017-05-01 (11001-12000)/22316 in 64.60s
physics from 2017-01-01 until 2017-05-01 (12001-13000)/22316 in 51.89s
physics from

UnicodeEncodeError: 'charmap' codec can't encode characters in position 389-390: character maps to <undefined>

In [7]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-05-02", f"{year}-07-01", category=cat)

Writing to arXivMeta_physics_from_2017-05-02_to_2017-07-01.csv

physics from 2017-05-02 until 2017-07-01 (    1- 1000)/10406 in 130.66s
physics from 2017-05-02 until 2017-07-01 ( 1001- 2000)/10406 in 183.40s
physics from 2017-05-02 until 2017-07-01 ( 2001- 3000)/10406 in 63.10s
physics from 2017-05-02 until 2017-07-01 ( 3001- 4000)/10406 in 29.57s
physics from 2017-05-02 until 2017-07-01 ( 4001- 5000)/10406 in 27.15s
physics from 2017-05-02 until 2017-07-01 ( 5001- 6000)/10406 in 28.66s
physics from 2017-05-02 until 2017-07-01 ( 6001- 7000)/10406 in 28.66s
physics from 2017-05-02 until 2017-07-01 ( 7001- 8000)/10406 in 26.82s
physics from 2017-05-02 until 2017-07-01 ( 8001- 9000)/10406 in 26.57s
physics from 2017-05-02 until 2017-07-01 ( 9001-10000)/10406 in 25.91s
physics from 2017-05-02 until 2017-07-01 (10001-10406)/10406 in 7.98s
physics from 2017-05-02 until 2017-07-01 retrieved 10406 records in 10 min



10406

In [9]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-07-02", f"{year}-08-01", category=cat)

Writing to arXivMeta_physics_from_2017-07-02_to_2017-08-01.csv

physics from 2017-07-02 until 2017-08-01 (    1- 1000)/ 4688 in 24.29s
physics from 2017-07-02 until 2017-08-01 ( 1001- 2000)/ 4688 in 26.95s
physics from 2017-07-02 until 2017-08-01 ( 2001- 3000)/ 4688 in 29.75s
physics from 2017-07-02 until 2017-08-01 ( 3001- 4000)/ 4688 in 24.30s
physics from 2017-07-02 until 2017-08-01 ( 4001- 4688)/ 4688 in 10.85s
physics from 2017-07-02 until 2017-08-01 retrieved 4688 records in 2 min



4688

In [10]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-08-02", f"{year}-08-15", category=cat)

Writing to arXivMeta_physics_from_2017-08-02_to_2017-08-15.csv

physics from 2017-08-02 until 2017-08-15 (    1- 1000)/ 2823 in 45.31s
physics from 2017-08-02 until 2017-08-15 ( 1001- 2000)/ 2823 in 58.93s
physics from 2017-08-02 until 2017-08-15 ( 2001- 2823)/ 2823 in 56.99s
physics from 2017-08-02 until 2017-08-15 retrieved 2823 records in 3 min



2823

In [11]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-08-16", f"{year}-08-20", category=cat)

Writing to arXivMeta_physics_from_2017-08-16_to_2017-08-20.csv

physics from 2017-08-16 until 2017-08-20 (    1-  662)/  662 in 75.35s
physics from 2017-08-16 until 2017-08-20 retrieved 662 records in 1 min



662

In [15]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-08-21", f"{year}-08-21", category=cat)

Writing to arXivMeta_physics_from_2017-08-21_to_2017-08-21.csv



UnicodeEncodeError: 'charmap' codec can't encode characters in position 389-390: character maps to <undefined>

In [16]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-08-22", f"{year}-09-01", category=cat)

Writing to arXivMeta_physics_from_2017-08-22_to_2017-09-01.csv

physics from 2017-08-22 until 2017-09-01 (    1- 1000)/ 5683 in 24.57s
physics from 2017-08-22 until 2017-09-01 ( 1001- 2000)/ 5683 in 32.82s
physics from 2017-08-22 until 2017-09-01 ( 2001- 3000)/ 5683 in 27.74s
physics from 2017-08-22 until 2017-09-01 ( 3001- 4000)/ 5683 in 24.00s
physics from 2017-08-22 until 2017-09-01 ( 4001- 5000)/ 5683 in 23.78s
physics from 2017-08-22 until 2017-09-01 ( 5001- 5683)/ 5683 in 17.04s
physics from 2017-08-22 until 2017-09-01 retrieved 5683 records in 3 min



5683

In [8]:
year = 2017
cat = 'physics'
harvest_data(f"{year}-09-02", f"{year}-12-31", category=cat)

Writing to arXivMeta_physics_from_2017-09-02_to_2017-12-31.csv

physics from 2017-09-02 until 2017-12-31 (    1- 1000)/24150 in 33.95s
physics from 2017-09-02 until 2017-12-31 ( 1001- 2000)/24150 in 32.22s
physics from 2017-09-02 until 2017-12-31 ( 2001- 3000)/24150 in 27.45s
physics from 2017-09-02 until 2017-12-31 ( 3001- 4000)/24150 in 25.59s
physics from 2017-09-02 until 2017-12-31 ( 4001- 5000)/24150 in 30.78s
physics from 2017-09-02 until 2017-12-31 ( 5001- 6000)/24150 in 26.11s
physics from 2017-09-02 until 2017-12-31 ( 6001- 7000)/24150 in 27.98s
physics from 2017-09-02 until 2017-12-31 ( 7001- 8000)/24150 in 29.14s
physics from 2017-09-02 until 2017-12-31 ( 8001- 9000)/24150 in 28.11s
physics from 2017-09-02 until 2017-12-31 ( 9001-10000)/24150 in 29.45s
physics from 2017-09-02 until 2017-12-31 (10001-11000)/24150 in 28.80s
physics from 2017-09-02 until 2017-12-31 (11001-12000)/24150 in 30.68s
physics from 2017-09-02 until 2017-12-31 (12001-13000)/24150 in 26.99s
physics from 

24150

In [12]:
year = 2016
cat = 'physics'
harvest_data(f"{year}-01-01", f"{year}-05-01", category=cat)
harvest_data(f"{year}-05-02", f"{year}-09-01", category=cat)
harvest_data(f"{year}-09-02", f"{year}-12-31", category=cat)

Writing to arXivMeta_physics_from_2016-01-01_to_2016-05-01.csv

physics from 2016-01-01 until 2016-05-01 (    1- 1000)/21713 in 31.76s
physics from 2016-01-01 until 2016-05-01 ( 1001- 2000)/21713 in 34.57s
physics from 2016-01-01 until 2016-05-01 ( 2001- 3000)/21713 in 30.48s
physics from 2016-01-01 until 2016-05-01 ( 3001- 4000)/21713 in 28.68s
physics from 2016-01-01 until 2016-05-01 ( 4001- 5000)/21713 in 29.63s
physics from 2016-01-01 until 2016-05-01 ( 5001- 6000)/21713 in 26.87s
physics from 2016-01-01 until 2016-05-01 ( 6001- 7000)/21713 in 27.58s
physics from 2016-01-01 until 2016-05-01 ( 7001- 8000)/21713 in 27.87s
physics from 2016-01-01 until 2016-05-01 ( 8001- 9000)/21713 in 29.14s
physics from 2016-01-01 until 2016-05-01 ( 9001-10000)/21713 in 29.69s
physics from 2016-01-01 until 2016-05-01 (10001-11000)/21713 in 27.08s
physics from 2016-01-01 until 2016-05-01 (11001-12000)/21713 in 35.72s
physics from 2016-01-01 until 2016-05-01 (12001-13000)/21713 in 27.83s
physics from 

27849

In [163]:
year = 2015
cat = 'physics'
harvest_data(f"{year}-01-01", f"{year}-06-01", category=cat)

Writing to arXivMeta_physics_from_2015-01-01_to_2015-12-31.csv

physics from 2015-01-01 until 2015-12-31 (    1- 1000)/170251 in 28.50s
physics from 2015-01-01 until 2015-12-31 ( 1001- 2000)/170251 in 27.58s
physics from 2015-01-01 until 2015-12-31 ( 2001- 3000)/170251 in 26.90s
physics from 2015-01-01 until 2015-12-31 ( 3001- 4000)/170251 in 27.16s
physics from 2015-01-01 until 2015-12-31 ( 4001- 5000)/170251 in 25.46s
physics from 2015-01-01 until 2015-12-31 ( 5001- 6000)/170251 in 26.21s
physics from 2015-01-01 until 2015-12-31 ( 6001- 7000)/170251 in 26.06s
physics from 2015-01-01 until 2015-12-31 ( 7001- 8000)/170251 in 26.87s
physics from 2015-01-01 until 2015-12-31 ( 8001- 9000)/170251 in 27.72s
physics from 2015-01-01 until 2015-12-31 ( 9001-10000)/170251 in 28.31s
physics from 2015-01-01 until 2015-12-31 (10001-11000)/170251 in 28.09s
physics from 2015-01-01 until 2015-12-31 (11001-12000)/170251 in 26.01s
physics from 2015-01-01 until 2015-12-31 (12001-13000)/170251 in 25.33s


0