In [2]:
import datetime
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import os
import pandas as pd
import random
import re
import requests
import time
import xml.etree.ElementTree as ET


def tokenize_alphanum(text):
    """Turn ``text`` into a list of lower-cased, lemmatized word tokens.

    Despite the name, only runs of ASCII letters are kept: digits and
    punctuation act as separators and are dropped.

    Args:
        text: A ``str``, or a ``bytes`` object holding UTF-8 encoded text.

    Returns:
        list of str: the lemmas in their original order, with every token
        that is a stop word (before or after lemmatization) removed.

    Relies on the module-level ``lemmatizer`` and ``stop_words`` objects.
    """
    try:
        text = text.decode('utf-8')
    except AttributeError:
        # Already a str (no .decode attribute) -- use it as is.
        pass
    tokens = []
    for raw in re.findall('[a-zA-Z]+', text):
        word = raw.lower()
        # Filter on the surface form first: lemmatizing a stop word can turn
        # it into a string that is no longer in the stop list (e.g. the
        # default noun lemmatization maps "was" to "wa"), which would
        # otherwise slip through.
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Also drop words whose lemma is itself a stop word.
        if lemma in stop_words:
            continue
        tokens.append(lemma)
    return tokens

def finished(n_papers):
    """Report whether enough page files have been written to stop harvesting.

    Counts the files in ``data/`` whose name starts with ``arxiv`` and ends
    with ``.json``, treats each one as 1000 papers, and returns ``True`` once
    that total is within 1000 of ``n_papers``.
    """
    n_pages = sum(
        1
        for name in os.listdir("data/")
        if name.startswith("arxiv") and name.endswith(".json")
    )
    return n_pages * 1000 >= (n_papers - 1000)

def get_file_number(n_papers):
    """Pick, at random, the offset of a page that has not been saved yet.

    Page offsets are the multiples of 1000 below ``n_papers``. Offsets that
    already have a ``data/arxiv-<offset>.json`` file are excluded, and one of
    the remaining offsets is returned. Raises ``IndexError`` (from
    ``random.choice``) when no offset is left.
    """
    # Offsets already on disk, parsed from the "arxiv-<offset>.json" names.
    done = [
        int(name.split("-")[1].split(".")[0])
        for name in os.listdir("data/")
        if name.startswith("arxiv") and name.endswith(".json")
    ]
    pending = set(np.arange(0, n_papers, 1000)).difference(done)
    return random.choice(list(pending))

In [2]:
# XML namespace prefixes in ElementTree's "{uri}tag" notation; they are
# prepended to tag names when searching the OAI-PMH responses.
OAI = "{http://www.openarchives.org/OAI/2.0/}"
ARXIV = "{http://arxiv.org/OAI/arXiv/}"
# Number of papers the harvest is sized for; pages are requested in steps of
# 1000 up to this value. Presumably a snapshot of the archive size when the
# notebook was written -- TODO confirm/update before re-running.
N_PAPERS = 1385353
# Shared by tokenize_alphanum(). Both need the corresponding NLTK data
# (stopwords / wordnet corpora) to be installed.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Create the output directory in one step; exist_ok avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs("data", exist_ok=True)

In [4]:
def _fetch_oai_root(params, timeout):
    """GET one page from the arXiv OAI endpoint and return its parsed XML root."""
    r = requests.get('http://export.arxiv.org/oai2', params=params, timeout=timeout)
    # Surface HTTP errors (e.g. throttling) directly instead of handing an
    # error page to the XML parser.
    r.raise_for_status()
    return ET.fromstring(r.text)


def _list_records(root):
    """Return the ``ListRecords`` element, raising if the response has none."""
    records = root.find(OAI+'ListRecords')
    if records is None:
        detail = root.findtext(OAI+'error', default="no ListRecords element in response")
        raise RuntimeError("arXiv OAI request failed: " + detail)
    return records


def _record_to_row(record):
    """Return ``(identifier, row)`` for one OAI ``<record>`` element.

    ``row`` is ``None`` when the record carries no arXiv metadata block.
    """
    arxiv_id = record.find(OAI+'header').find(OAI+'identifier').text
    meta = record.find(OAI+'metadata')
    info = meta.find(ARXIV+"arXiv") if meta is not None else None
    if info is None:
        return arxiv_id, None
    # findtext() gives "" for a missing or empty element, so one incomplete
    # record no longer aborts the whole page with an AttributeError.
    categories = info.findtext(ARXIV+"categories", default="")
    raw_title = info.findtext(ARXIV+"title", default="").strip()
    raw_summary = info.findtext(ARXIV+"abstract", default="").strip()
    created_text = info.findtext(ARXIV+"created", default="").strip()
    created = None
    if created_text:
        created = datetime.datetime.strptime(created_text, "%Y-%m-%d")
    row = dict(id=arxiv_id, categories=categories,
               raw_title=raw_title, raw_summary=raw_summary,
               summary=tokenize_alphanum(raw_summary),
               title=tokenize_alphanum(raw_title), created=created)
    return arxiv_id, row


def get_arxiv_data(delay=25, timeout=120):
    """Download not-yet-saved pages of arXiv metadata into ``data/``.

    A first ``ListRecords`` request is made only to learn the prefix of the
    server's resumption token. After that, each iteration asks
    ``get_file_number`` for a random missing page offset, requests that page
    with a token built from the prefix and the offset, and writes the parsed
    records to ``data/arxiv-<cursor>.json``.

    Args:
        delay: Seconds to sleep after each request.
        timeout: Timeout in seconds passed to each HTTP request.

    Returns:
        True, once a response without a usable resumption token is reached.

    Raises:
        requests.RequestException: on network failures or HTTP error codes.
        RuntimeError: when a response has no ``ListRecords`` element or the
            first response has no resumption token.
    """
    params = dict(verb='ListRecords', metadataPrefix='arXiv')

    # Acquire a resumption token seed
    token = _list_records(_fetch_oai_root(params, timeout)).find(OAI+"resumptionToken")
    if token is None or token.text is None:
        raise RuntimeError("arXiv OAI response did not contain a resumption token")
    seed_token = token.text.split("|")[0]
    time.sleep(delay)

    all_ids = set()

    print("Using params", params)
    while True:
        output = []
        file_number = get_file_number(N_PAPERS)
        resumptionToken = seed_token+"|"+str(file_number+1)
        params = dict(verb='ListRecords', resumptionToken=resumptionToken)
        i_req = int(1 + file_number/1000)

        records = _list_records(_fetch_oai_root(params, timeout))
        n_ids_before = len(all_ids)

        for record in records.findall(OAI+"record"):
            arxiv_id, row = _record_to_row(record)
            all_ids.add(arxiv_id)
            if row is not None:
                output.append(row)

        # Get next token
        token = records.find(OAI+"resumptionToken")
        if token is None or token.text is None:
            # NOTE(review): the records parsed from this page are dropped
            # without being written. finished() tolerates one missing page,
            # which looks like the counterpart of this -- confirm before
            # changing either side.
            break
        # Get cursor and write to file
        cursor = token.attrib['cursor']
        filename = "data/arxiv-"+str(cursor)+".json"
        d_n = len(all_ids) - n_ids_before
        print(i_req, "Writing", d_n, "to", filename, "(", token.text, ")")
        pd.DataFrame(output).to_json(filename, orient='records')
        time.sleep(delay)
    return True
        
# Keep harvesting until finished() reports enough page files. Any failure
# inside get_arxiv_data() is reported, followed by a pause and a fresh start.
while not finished(N_PAPERS):
    try:
        get_arxiv_data()
    except Exception as err:
        # Show the exception that was actually raised; str(Exception) would
        # only print the class object "<class 'Exception'>".
        print("Got", repr(err))
        time.sleep(25)
        print("Restarting...")
        print()

Using params {'verb': 'ListRecords', 'metadataPrefix': 'arXiv'}
516 Writing 1000 to data/arxiv-515000.json ( 2608011|516001 )
34 Writing 1000 to data/arxiv-33000.json ( 2608011|34001 )
1173 Writing 1000 to data/arxiv-1172000.json ( 2608011|1173001 )
768 Writing 1000 to data/arxiv-767000.json ( 2608011|768001 )
87 Writing 1000 to data/arxiv-86000.json ( 2608011|87001 )
447 Writing 1000 to data/arxiv-446000.json ( 2608011|447001 )
561 Writing 1000 to data/arxiv-560000.json ( 2608011|561001 )
633 Writing 1000 to data/arxiv-632000.json ( 2608011|633001 )
831 Writing 1000 to data/arxiv-830000.json ( 2608011|831001 )
1354 Writing 1000 to data/arxiv-1353000.json ( 2608011|1354001 )
964 Writing 1000 to data/arxiv-963000.json ( 2608011|964001 )
77 Writing 1000 to data/arxiv-76000.json ( 2608011|77001 )
733 Writing 1000 to data/arxiv-732000.json ( 2608011|733001 )
1289 Writing 1000 to data/arxiv-1288000.json ( 2608011|1289001 )
1253 Writing 1000 to data/arxiv-1252000.json ( 2608011|1253001 )
106

In [3]:
# Also create a convenience subset for articles with cs.*, stat.ML and summaries greater than 20 tokens
data = []
top_dir = "data/"
# sorted() makes the row order of the combined frame reproducible;
# os.listdir() returns names in arbitrary order.
for file_name in sorted(os.listdir(top_dir)):
    if not (file_name.startswith("arxiv") and file_name.endswith(".json")):
        continue
    _df = pd.read_json(top_dir+file_name,orient="records")
    # A page file holding "[]" loads as a frame without columns; skip it
    # instead of failing on the missing "categories" column.
    if not _df.empty:
        _df.categories = _df.categories.apply(lambda x: x.split())
        condition = _df.categories.apply(lambda x : "stat.ML" in x)
        condition = condition | _df.categories.apply(lambda x : any(y.startswith("cs.") for y in x))
        condition = condition & (_df.summary.apply(lambda s : len(s) > 20))
        new_df = _df.loc[condition].copy()
        data.append(new_df)
    del (_df)
if not data:
    raise ValueError("No non-empty arxiv*.json page files found in " + top_dir)
# Every page frame is indexed from 0, so renumber to avoid duplicated labels.
df = pd.concat(data, ignore_index=True)

In [4]:
# Save the filtered subset as a single JSON array of row objects. The file
# name starts with "cs_", so the "arxiv*.json" page-file filters used in the
# other cells do not pick it up.
df.to_json("data/cs_arxiv.json", orient="records")