In [9]:
import nltk.data
import random
import re


def draw_sentences(s, n, min_len):
    """Randomly draw ``n`` sentences of at least ``min_len`` characters from ``s``.

    Runs of whitespace (including newlines) in ``s`` are collapsed to single
    spaces before the text is split with NLTK's English Punkt tokenizer.

    Args:
        s: Text to split into sentences.
        n: Number of sentences to draw (without replacement).
        min_len: Minimum length in characters for a sentence to be eligible.

    Returns:
        A tuple of ``n`` sentences in random order, or a tuple of ``n`` empty
        strings when fewer than ``n`` sentences are eligible.
    """
    # NOTE(review): recent NLTK releases distribute "punkt_tab" instead of pickled
    # Punkt models -- confirm this resource path against the installed version.
    tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
    # Whitespace is normalized up front, so the tokenizer never sees a newline.
    flat_text = re.sub(r"\s+", " ", s)
    candidates = [sn for sn in tokenizer.tokenize(flat_text) if len(sn) >= min_len]
    try:
        return tuple(random.sample(candidates, n))
    except ValueError:
        # random.sample rejects n > len(candidates) (and negative n); this is the
        # only failure that should be turned into the empty-string placeholder.
        return ("",) * n

# Smoke test: draw two sentences of at least 10 characters from a short dictionary entry.
draw_sentences(
    s=(
        "1234567890. "
        "Disagreement with the ideas, doctrines, decrees, etc. of a political party, government or religion. "
        "An act of disagreeing with, or deviating from, the views and opinions of those holding authority. "
        "(Anglo-American common law) A separate opinion filed in a case by judges who disagree with the outcome of the majority of the court in that case. "
        "(sports) A violation that arises when disagreement with an official call is expressed in an inappropriate manner such as foul language, rude gestures, of failure to comply."
    ),
    n=2,
    min_len=10,
)

('1234567890.', 'of a political party, government or religion.')

In [7]:
import itertools
import json
import random
import re
import zipfile

import nltk.data
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

# If loading the Punkt tokenizer fails because the resource is missing,
# download it once with:
# nltk.download('punkt')

# Zip archive containing the arXiv metadata snapshot, and the name of the
# JSON-lines member inside that archive.
ARXIV_ZIP = "arxiv dump/arxiv-metadata-oai-snapshot-version111.json.zip"
DUMP_JSON = "arxiv-metadata-oai-snapshot.json"
# Seed used for the `random` module below and passed as `random_state` when sampling rows.
RANDOM_STATE = 1

# Seed Python's global RNG, which the sentence sampling relies on.
random.seed(RANDOM_STATE)


def draw_sentences(s, n, min_len):
    """Randomly draw ``n`` sentences of at least ``min_len`` characters from ``s``.

    Runs of whitespace (including newlines) in ``s`` are collapsed to single
    spaces before the text is split with NLTK's English Punkt tokenizer.

    Args:
        s: Text to split into sentences.
        n: Number of sentences to draw (without replacement).
        min_len: Minimum length in characters for a sentence to be eligible.

    Returns:
        A tuple of ``n`` sentences in random order, or a tuple of ``n`` empty
        strings when fewer than ``n`` sentences are eligible.
    """
    # NOTE(review): recent NLTK releases distribute "punkt_tab" instead of pickled
    # Punkt models -- confirm this resource path against the installed version.
    tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
    # Whitespace is normalized up front, so the tokenizer never sees a newline.
    flat_text = re.sub(r"\s+", " ", s)
    candidates = [sn for sn in tokenizer.tokenize(flat_text) if len(sn) >= min_len]
    try:
        return tuple(random.sample(candidates, n))
    except ValueError:
        # random.sample rejects n > len(candidates) (and negative n); this is the
        # only failure that should be turned into the empty-string placeholder.
        return ("",) * n
    

def draw(df, cat, min_len=10, ratio=0.1, random_state=None):
    """Draw (anchor, positive, negative) sentence triplets for one category.

    A fraction ``ratio`` of the rows whose ``categories`` contain ``cat`` is
    sampled (at least one row). For each sampled row, the anchor ``a`` and the
    positive ``p`` are two sentences from its own abstract, and the negative
    ``n`` is one sentence from the abstract of a randomly chosen row of the
    same category that has a different title.

    Args:
        df: Frame with ``title``, ``abstract`` and ``categories`` columns.
        cat: Category to select; a row matches when ``cat in categories``.
        min_len: Minimum sentence length in characters, forwarded to
            ``draw_sentences``.
        ratio: Fraction of the matching rows to sample.
        random_state: Seed forwarded to ``DataFrame.sample`` for the row
            selection. Sentence and negative-row choices use the global
            ``random`` module state instead.

    Returns:
        The sampled rows with added ``a``, ``p`` and ``n`` columns, or an empty
        DataFrame when no row matches ``cat``. A cell holds ``""`` where no
        suitable sentence (or no differently titled row) was available.
    """
    sdf = df[df["categories"].map(lambda c: cat in c)]
    if len(sdf) == 0:
        return pd.DataFrame()

    # Always sample at least one row, even for tiny categories.
    num_samples = int(len(sdf) * ratio) or 1
    # Honour the caller's seed; copy so that adding columns never touches `df`.
    samples = sdf.sample(num_samples, random_state=random_state).copy()

    pairs = [draw_sentences(abstract, 2, min_len) for abstract in samples["abstract"]]
    samples["a"] = [pair[0] for pair in pairs]
    samples["p"] = [pair[1] for pair in pairs]

    # Extract the category's titles/abstracts once instead of re-filtering the
    # whole DataFrame for every sampled row.
    titles = sdf["title"].to_numpy()
    abstracts = sdf["abstract"].to_numpy()

    def draw_negative(title):
        # Candidate negatives: abstracts of all rows with a different title.
        candidates = abstracts[titles != title]
        if len(candidates) == 0:
            # E.g. a category with a single paper: no negative can be drawn.
            return ""
        other_abstract = candidates[random.randrange(len(candidates))]
        return draw_sentences(other_abstract, 1, min_len)[0]

    samples["n"] = [
        draw_negative(title)
        for title in tqdm(samples["title"].to_numpy(), total=len(samples), desc=cat)
    ]
    return samples

## Read publications from the zipped arXiv JSON dump

In [2]:
%%time
# Collect one plain tuple per JSON line and build the DataFrame once at the end;
# creating a one-row DataFrame per record and concatenating them all is far slower.
columns = ["title", "abstract", "categories", "doi"]
records = []
with zipfile.ZipFile(ARXIV_ZIP) as za:
    # The progress bar tracks bytes consumed from the uncompressed JSON member.
    with tqdm(total=za.getinfo(DUMP_JSON).file_size, unit="b", unit_divisor=1024, unit_scale=True, desc=DUMP_JSON) as pb:
        with za.open(DUMP_JSON) as f:
            for l in f:
                j = json.loads(l)
                records.append(tuple(j[c] for c in columns))
                pb.update(len(l))
df = pd.DataFrame(records, columns=columns)

arxiv-metadata-oai-snapshot.json:   0%|          | 0.00/3.37G [00:00<?, ?b/s]

CPU times: user 7min 36s, sys: 9.04 s, total: 7min 45s
Wall time: 7min 43s


In [3]:
# preprocessing
# Split the whitespace-separated category string into a tuple of category codes.
# Values that are already tuples pass through unchanged, so re-running this
# cell does not fail on the previously converted column.
df["categories"] = df["categories"].map(lambda c: tuple(c.split()) if isinstance(c, str) else tuple(c))
# Seeded so the preview is the same on every run.
df.sample(5, random_state=RANDOM_STATE)

Unnamed: 0,title,abstract,categories,doi
1067979,Open Power System Data - Frictionless data for...,The quality of electricity system modelling ...,"(cs.CY,)",10.1016/j.apenergy.2018.11.097
199986,F_2^bbbar measurement at ZEUS,Two recent measurements of beauty production...,"(hep-ex,)",
1527128,OPIRL: Sample Efficient Off-Policy Inverse Rei...,Inverse Reinforcement Learning (IRL) is attr...,"(cs.LG, cs.AI, cs.RO)",
1842109,A model for OH(1720 MHz) masers associated wit...,OH(1720 MHz) masers unaccompanied by 1665/7 ...,"(astro-ph,)",
2090350,Green's formulas for cone differential operators,Green's formulas for elliptic cone different...,"(math.AP,)",


In [4]:
# Number of rows loaded from the dump.
df.shape[0]

2187423

## Draw triplet samples per arXiv math and stat category

In [7]:
%%time
# Categories of interest: every label that starts with "math" or "stat".
math_cats = sorted(
    c
    for c in set(itertools.chain.from_iterable(df["categories"]))
    if c.startswith(("math", "stat"))
)

# Draw triplet samples for each of those categories.
math_dfs = {
    c: draw(df, c, min_len=35, random_state=RANDOM_STATE)
    for c in tqdm(math_cats, total=len(math_cats), desc="Categories")
}

# Combine the per-category results into the final dataset.
math_dataset = pd.concat(math_dfs.values(), ignore_index=True)
# Keep only rows whose anchor, positive and negative are all non-empty.
is_complete = (math_dataset[["a", "p", "n"]] != "").all(axis=1)
math_dataset = math_dataset[is_complete].reset_index(drop=True)

print("num examples:", len(math_dataset))
print("unique a:", math_dataset["a"].nunique(dropna=False))
print("unique p:", math_dataset["p"].nunique(dropna=False))
print("unique n:", math_dataset["n"].nunique(dropna=False))
print("unique a & p", pd.concat([math_dataset["a"], math_dataset["p"]], ignore_index=True).nunique(dropna=False))
print("unique a & p & n", pd.concat([math_dataset["a"], math_dataset["p"], math_dataset["n"]], ignore_index=True).nunique(dropna=False))

Categories:   0%|          | 0/39 [00:00<?, ?it/s]

math-ph:   0%|          | 0/7265 [00:00<?, ?it/s]

math.AC:   0%|          | 0/1160 [00:00<?, ?it/s]

math.AG:   0%|          | 0/4771 [00:00<?, ?it/s]

math.AP:   0%|          | 0/5413 [00:00<?, ?it/s]

math.CT:   0%|          | 0/742 [00:00<?, ?it/s]

math.CV:   0%|          | 0/1542 [00:00<?, ?it/s]

math.DG:   0%|          | 0/3828 [00:00<?, ?it/s]

math.DS:   0%|          | 0/3224 [00:00<?, ?it/s]

math.FA:   0%|          | 0/2775 [00:00<?, ?it/s]

math.GM:   0%|          | 0/353 [00:00<?, ?it/s]

math.GN:   0%|          | 0/461 [00:00<?, ?it/s]

math.GR:   0%|          | 0/1988 [00:00<?, ?it/s]

math.GT:   0%|          | 0/2076 [00:00<?, ?it/s]

math.HO:   0%|          | 0/322 [00:00<?, ?it/s]

math.IT:   0%|          | 0/4134 [00:00<?, ?it/s]

math.KT:   0%|          | 0/540 [00:00<?, ?it/s]

math.LO:   0%|          | 0/1167 [00:00<?, ?it/s]

math.MG:   0%|          | 0/997 [00:00<?, ?it/s]

math.MP:   0%|          | 0/7265 [00:00<?, ?it/s]

math.NA:   0%|          | 0/3138 [00:00<?, ?it/s]

math.NT:   0%|          | 0/3713 [00:00<?, ?it/s]

math.OA:   0%|          | 0/1155 [00:00<?, ?it/s]

math.OC:   0%|          | 0/3989 [00:00<?, ?it/s]

math.PR:   0%|          | 0/5006 [00:00<?, ?it/s]

math.QA:   0%|          | 0/1771 [00:00<?, ?it/s]

math.RA:   0%|          | 0/1558 [00:00<?, ?it/s]

math.RT:   0%|          | 0/2349 [00:00<?, ?it/s]

math.SG:   0%|          | 0/830 [00:00<?, ?it/s]

math.SP:   0%|          | 0/876 [00:00<?, ?it/s]

math.ST:   0%|          | 0/2004 [00:00<?, ?it/s]

stat.AP:   0%|          | 0/1548 [00:00<?, ?it/s]

stat.CO:   0%|          | 0/704 [00:00<?, ?it/s]

stat.ME:   0%|          | 0/2168 [00:00<?, ?it/s]

stat.ML:   0%|          | 0/5735 [00:00<?, ?it/s]

stat.OT:   0%|          | 0/113 [00:00<?, ?it/s]

stat.TH:   0%|          | 0/2004 [00:00<?, ?it/s]

num examples: 92202
unique a: 89350
unique p: 89293
unique n: 89865
unique a & p 172826
unique a & p & n 255671
CPU times: user 11min 59s, sys: 6.33 s, total: 12min 6s
Wall time: 12min 2s


In [8]:
# Seeded so the preview shows the same rows on every run.
math_dataset.sample(10, random_state=RANDOM_STATE)

Unnamed: 0,title,abstract,categories,doi,a,p,n
48677,A generalisation of the relation between zeros...,The zeros of the random Laurent series $1/\m...,"(math-ph, math.MP, math.PR)",,The zeros of the random Laurent series $1/\mu...,Since the correlation functions of the latter ...,"In this limit, the power dissipated by the fie..."
50760,Homogenization for a Class of Generalized Lang...,We study a class of systems whose dynamics a...,"(math-ph, math.MP, math.PR)",10.1007/s10955-018-2192-9,The convergence results are obtained using the...,We apply our results to study thermophoresis o...,"Motivated by this, we seek for explicit soluti..."
26042,Global Dimension of Polynomial Rings in Partia...,For any free partially commutative monoid $M...,"(math.CT, math.KT)",10.1007/s00233-010-9264-8,"As a corollary, we generalize Hilbert's Syzygy...",For any free partially commutative monoid $M(...,A modified version of the join construction ca...
30353,Snapping elastic curves as a one-dimensional a...,In order to study a one-dimensional analogue...,"(math.AP, math-ph, math.DG, math.MP)",10.1142/S0218202511005234,Each phase induces a preferred curvature to th...,The theoretical result is illustrated by some ...,We prove several vanishing theorems for a cla...
63552,Online Convex Optimization Using Coordinate De...,This paper considers the problem of online o...,"(math.OC, cs.SY, eess.SY)",,This paper considers the problem of online op...,Instead of solving the problem exactly at each...,The monograph can be useful for undergraduate ...
54172,A multisymplectic approach to defects in integ...,We introduce the concept of multisymplectic ...,"(math-ph, hep-th, math.MP, nlin.SI)",10.1007/JHEP02(2015)088,"Taking the nonlinear Schr\""odinger (NLS) equat...",It allows us to reinterpret the defect conditi...,We determine various new equivalence pairs for...
2703,Four-parameter families of complex Hadamard ma...,In this paper we provide a general method to...,"(math-ph, math.MP, quant-ph)",,Our approach is to write a 6-dimensional matri...,We hope that the problem of mutually unbiased ...,We prove that the $k$-particle density matrice...
57714,New efficient time-stepping schemes for the an...,"In this paper, we propose and analyze a firs...","(math.NA, cs.NA)",,"That is, it only requires solving four linear ...",A detailed comparison with existing schemes is...,Some sharp inequalities of Gruss type for seq...
15432,Blow-up problems for nonlinear parabolic equat...,"Let $G=(V,E)$ be a locally finite connected ...","(math.AP,)",,The blow-up phenomenons of the equation are di...,"Let $G=(V,E)$ be a locally finite connected w...","Additionally, we establish a blowup criterion ..."
1940,The Galois coaction on the electron anomalous ...,Recently S. Laporta published a partial resu...,"(math-ph, hep-th, math.MP, math.NT, quant-ph)",,We prove this conjecture in the motivic setup.,The conversion into the $f$ alphabet relies on...,The rate of wear of the contact surface is des...


## Export dataset

In [9]:
# Write the final triplet dataset, row index included, to a CSV file in the working directory.
math_dataset.to_csv(path_or_buf="anchor-arxiv-dataset.csv", index=True)