In [46]:
import itertools
import json
import random
import re
import zipfile

import nltk.data
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

# if you get an error
# nltk.download('punkt')

ARXIV_ZIP = "arxiv dump/arxiv-metadata-oai-snapshot-version111.json.zip"
DUMP_JSON = "arxiv-metadata-oai-snapshot.json"
RANDOM_STATE = 1

random.seed(RANDOM_STATE)



def draw_similar(df, cat, ratio=.1, random_state=None):
    sdf = df[df["categories"].map(lambda c: cat in c)]
    if len(sdf)==0:
        return pd.DataFrame()
    if ratio < 1:
        ratio = 2*ratio
    num_samples = int(len(sdf)*ratio)
    if num_samples == 0:
        num_samples = 1 
    if num_samples % 2 != 0:
        num_samples = num_samples+1
        
    titles = sdf.sample(num_samples, random_state=random_state).reset_index(drop=True)
    similar_titles = titles.iloc[int(num_samples/2):].reset_index(drop=True)
    titles = titles.iloc[:int(num_samples/2)]
    
    samples = titles.reset_index(drop=True).merge(similar_titles,
            left_index=True, right_index=True, suffixes=("_a", "_b"))
    
    return samples


def draw_nonsimilar(df, cat, num_samples, random_state=None):
    sdf = df[df["categories"].map(lambda c: cat in c)]
    odf = df[df["categories"].map(lambda c: cat not in c)]
    titles = sdf.sample(num_samples, random_state=RANDOM_STATE)
    other_titles = odf.sample(num_samples, random_state=RANDOM_STATE)
    
    samples = titles.reset_index(drop=True).merge(other_titles.reset_index(drop=True),
            left_index=True, right_index=True, suffixes=("_a", "_b"))
    return samples


def draw_equal(df, cat, labels=(1,0), ratio=0.1, random_state=None):
    # if sampling one class fails, ignore the complete MSC
    try:
        s = draw_similar(df, cat, ratio=ratio, random_state=random_state)
        s["label"] = labels[0]
        # maybe we need another random state here, as it could draw the same titles with different other_titles as p_samples() 
        n = draw_nonsimilar(df, cat, len(s), random_state=random_state)
        n["label"] = labels[1]
        e = pd.concat([s, n], ignore_index=True).reset_index(drop=True)
    except Exception as e:
        print("{}: {}".format(msc, e))
        e = pd.DataFrame()
    return e

## Read publications from zipped arXiv JSON-dump

In [4]:
%%time
dfs = []
with zipfile.ZipFile(ARXIV_ZIP) as za:
    with tqdm(total=za.getinfo(DUMP_JSON).file_size, unit="b", unit_divisor=1024, unit_scale=True, desc=DUMP_JSON) as pb:
        with za.open(DUMP_JSON) as f:       
            for l in f:
                j = json.loads(l)
                dfs.append(pd.DataFrame([[j["title"], j["abstract"], j["categories"], j["doi"]]], columns=["title", "abstract", "categories", "doi"]))
                pb.update(len(l))
df = pd.concat(dfs, ignore_index=True).reset_index(drop=True)

arxiv-metadata-oai-snapshot.json:   0%|          | 0.00/3.37G [00:00<?, ?b/s]

CPU times: user 7min 39s, sys: 9.04 s, total: 7min 48s
Wall time: 7min 46s


In [5]:
# preprocessing
df["categories"] = df["categories"].map(lambda c: tuple(c.split()))
df.sample(5)

Unnamed: 0,title,abstract,categories,doi
610387,"Energy levels, radiative rates and electron im...","We report energy levels, radiative rates (A-...","(astro-ph.SR, physics.atom-ph)",10.1093/mnras/stv684
1708383,Non-perturbative production of fermionic dark ...,We investigate non-perturbative production o...,"(gr-qc, astro-ph.CO, hep-ph, hep-th)",
1690970,Developers Struggle with Authentication in Bla...,WebAssembly is a growing technology to build...,"(cs.CR, cs.SE)",
1582783,The Effective Radius of Self Repelling Elastic...,We study elastic manifolds with self-repelli...,"(math.PR,)",
1235665,Constraints on the Spacetime Dynamics of an Ea...,We consider an Early Dark Energy (EDE) cosmo...,"(astro-ph.CO,)",10.1088/1475-7516/2020/07/039


In [6]:
math_cats = sorted([c for c in set(itertools.chain.from_iterable(df["categories"])) if c.startswith("math") or c.startswith("stat")])
print(len(df))
df = df[df["categories"].map(lambda c: len(set(c) & set(math_cats)) > 0)]
df

2187423


Unnamed: 0,title,abstract,categories,doi
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...","(math.CO, cs.CG)",
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,"(math.CO,)",
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,"(math.CA, math.FA)",
9,"Partial cubes: structures, characterizations, ...",Partial cubes are isometric subgraphs of hyp...,"(math.CO,)",
10,Computing genus 2 Hilbert-Siegel modular forms...,In this paper we present an algorithm for co...,"(math.NT, math.AG)",
...,...,...,...,...
2187319,Yang-Baxter Algebra for the n-Harmonic Oscilla...,Using a rational R-matrix associated with th...,"(solv-int, math-ph, math.MP, nlin.SI)",10.1016/S0370-2693(99)01261-7
2187332,Integrable deformations of oscillator chains f...,A family of completely integrable nonlinear ...,"(solv-int, math.QA, nlin.SI)",10.1088/0305-4470/32/50/306
2187333,A note on real forms of the complex N=4 supers...,Three inequivalent real forms of the complex...,"(solv-int, hep-th, math-ph, math.MP, nlin.SI)",10.1016/S0550-3213(00)00121-8
2187347,Real forms of the complex twisted N=2 supersym...,Three nonequivalent real forms of the comple...,"(solv-int, hep-th, math-ph, math.MP, nlin.SI)",10.2991/jnmp.2000.7.4.3


In [7]:
df = df[df["categories"].map(lambda c: len(c) == 1)]
len(df)

272292

In [15]:
def join_text(title, abstract):
    title = title.strip("").rstrip(".")
    t = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
    ss = [sn.replace("\n", " ") for sn in t.tokenize(re.sub(r"\s+", " ", "{}. {}".format(title, abstract)))]
    try:
        return " ".join(ss)
    except Exception as e:
        return ""

df["text"] = df.progress_apply(lambda r: join_text(r["title"], r["abstract"]), axis=1)
df = df[df["text"] != ""]

  0%|          | 0/272292 [00:00<?, ?it/s]

In [57]:
%%time

# draw samples per category
cat_samples = { cat: draw_equal(df, cat, random_state=RANDOM_STATE) for cat in math_cats }

# combine all results as final data set
dataset = pd.concat(cat_samples.values(), ignore_index=True)

CPU times: user 8.84 s, sys: 232 ms, total: 9.07 s
Wall time: 9.07 s


In [58]:
dataset = dataset.drop(["title_a", "abstract_a", "doi_a", "title_b", "abstract_b", "doi_b"], axis=1)
dataset.sample(5)

Unnamed: 0,label,categories_a,text_a,categories_b,text_b
8969,0,"(math.AP,)",Lower bounds on blowing-up solutions of the 3D...,"(math.AG,)",Multiple cover formula of generalized DT invar...
31550,1,"(math.HO,)",A Numerical Precision Example for Teachers of ...,"(math.HO,)",The Triangle of Smallest Area Which Circumscri...
3480,0,"(math.AG,)",The geometric genus of hypersurface singularit...,"(math.GM,)",Banishing divergence Part 1: Infinite numbers ...
44586,1,"(math.PR,)",Characterization of pinched Ricci curvature by...,"(math.PR,)",On the first-passage area of a L$\acute{\text{...
45583,0,"(math.PR,)",On the existence and position of the farthest ...,"(math.DG,)",Scalar curvatures of invariant almost Hermitia...


In [59]:
dataset.to_csv("class-arxiv-dataset.csv")