In [1]:
import itertools
import json
import random
import re
import zipfile

import nltk.data
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

# If join_text fails because the punkt tokenizer data is missing, run this once:
# nltk.download('punkt')

# Path (relative to the working directory) of the zipped arXiv metadata snapshot.
ARXIV_ZIP = "arxiv dump/arxiv-metadata-oai-snapshot-version111.json.zip"
# Name of the JSON member inside the archive that gets read.
DUMP_JSON = "arxiv-metadata-oai-snapshot.json"
# Seed value used for the stdlib `random` module below.
RANDOM_STATE = 1

# NOTE(review): this seeds only Python's `random`; pandas/NumPy sampling is not affected by it.
random.seed(RANDOM_STATE)



def join_text(title, abstract):
    """Merge a paper's title and abstract into one whitespace-normalised string.

    The title is trimmed and its trailing period(s) removed, then joined to the
    abstract as ``"<title>. <abstract>"``. Every run of whitespace (including
    newlines) is collapsed to a single space, the result is split into
    sentences with NLTK's English punkt tokenizer, and the sentences are
    re-joined with single spaces.

    Args:
        title: Publication title; ``None`` is treated as an empty string.
        abstract: Publication abstract; ``None`` is treated as an empty string.

    Returns:
        The cleaned text, or ``""`` when both title and abstract are empty.
    """
    # strip() with no argument trims whitespace. The previous strip("") was a
    # no-op, so a title such as "Foo. " kept its period and produced "Foo. . ...".
    title = (title or "").strip().rstrip(".").rstrip()
    abstract = (abstract or "").strip()
    if not title and not abstract:
        return ""

    # Without a title there is nothing to prefix, so avoid a leading ". ".
    combined = "{}. {}".format(title, abstract) if title else abstract
    combined = re.sub(r"\s+", " ", combined)

    tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
    # Newlines were already collapsed above, so the sentences need no further cleanup.
    return " ".join(tokenizer.tokenize(combined))



## Read publications from zipped arXiv JSON-dump

In [2]:
%%time
# Collect plain row lists and build the DataFrame once at the end; creating a
# one-row DataFrame per JSON line and concatenating millions of them is far slower.
columns = ["title", "abstract", "categories", "doi"]
records = []
with zipfile.ZipFile(ARXIV_ZIP) as za:
    # Progress is measured in uncompressed bytes of the JSON member.
    with tqdm(total=za.getinfo(DUMP_JSON).file_size, unit="b", unit_divisor=1024, unit_scale=True, desc=DUMP_JSON) as pb:
        with za.open(DUMP_JSON) as f:
            for line in f:  # each line is parsed as its own JSON document
                j = json.loads(line)
                records.append([j[c] for c in columns])
                pb.update(len(line))
df = pd.DataFrame(records, columns=columns)

arxiv-metadata-oai-snapshot.json:   0%|          | 0.00/3.37G [00:00<?, ?b/s]

CPU times: user 7min 37s, sys: 8.74 s, total: 7min 45s
Wall time: 7min 43s


In [3]:
# Run join_text over every row (tqdm's progress_apply shows a progress bar),
# keep only the rows whose resulting text is non-empty, and display the row count.
df["text"] = df.progress_apply(
    lambda row: join_text(row["title"], row["abstract"]),
    axis=1,
)
non_empty = df["text"] != ""
df = df[non_empty]
len(df)

  0%|          | 0/2187423 [00:00<?, ?it/s]

2187423

In [5]:
# Pass the notebook seed explicitly: DataFrame.sample draws from NumPy's RNG,
# which random.seed(RANDOM_STATE) does not affect, so the preview was not reproducible.
df.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,categories,doi,text
1276796,math.AP math.OC,,Input-to-State Stability in sup norms for hype...
1650645,cond-mat.mtrl-sci,,"Structural, Dielectric, and Electrical Transpo..."
6374,hep-ex,,Measurement of the gluon polarisation at COMPA...
2011520,hep-ph,10.1143/PTPS.123.173,Looking Beyond the Standard Model through Prec...
351335,cs.AI,,Studies in Lower Bounding Probabilities of Evi...


In [4]:
# Drop the raw title/abstract columns (their content was merged into "text")
# and write the remaining columns to CSV.
df = df.drop(columns=["title", "abstract"])
df.to_csv("abstracts-arxiv-dataset.csv")