In [None]:
from config import Config
import networkx as nx
import json
import arxiv
from itertools import islice
import re
import pickle

# Load both reduced JSON datasets named in Config. Later cells treat each one
# as a mapping from paper id to a record containing an 'arxiv_bib_ids' list.
with open(Config.REDUCED_JSON_PATH, 'r') as papers_file:
    papers = json.load(papers_file)

with open(Config.UNARXIV_REDUCED_JSON_PATH, 'r') as unarxiv_file:
    unarxiv = json.load(unarxiv_file)

In [None]:
# Cursory density estimation: count how many citations point at papers that
# are themselves part of the dataset.

known_ids = papers.keys()  # dict key view; membership tests are hash lookups
found = {True: 0, False: 0}  # citation counts keyed by "target is in the dataset"
citing, cited, all_citations = set(), set(), set()
for paper_id, metadata in papers.items():
    for ref in metadata['arxiv_bib_ids']:
        all_citations.add(ref)
        is_internal = ref in known_ids
        found[is_internal] += 1
        if is_internal:
            citing.add(paper_id)
            cited.add(ref)

total_citations = found[True] + found[False]
cited_and_citing = cited & citing
print(
    f'This dataset contains {len(papers)} papers making {total_citations} citations '
    f'of {len(all_citations)} unique works, {found[True]} of which reference other papers in the dataset.\n'
    f'{len(cited_and_citing)} papers in the dataset have both an incoming and outgoing citation in the set.'
)

In [None]:
# Sanity check (shown as the cell output): every key of papers must also be a
# key of unarxiv, so unarxiv's keys alone are enough for node generation.

papers.keys() <= unarxiv.keys()

In [None]:
# Build the citation graph; every edge runs from the cited paper to the paper citing it.
g = nx.DiGraph()
for citing_id, record in unarxiv.items():
    for cited_id in record['arxiv_bib_ids']:
        # Skip self-citations and references to papers outside unarxiv
        if cited_id == citing_id or cited_id not in unarxiv:
            continue
        g.add_edge(cited_id, citing_id)

In [None]:
# Report (as the cell output) whether the freshly built citation graph is acyclic.
nx.is_directed_acyclic_graph(g)

In [None]:
# Fetch publication dates for relevant papers

def batched(iterable, n):
    """Yield successive tuples of at most ``n`` items drawn from ``iterable``.

    Items keep their original order. The last tuple may hold fewer than ``n``
    items, and an empty iterable yields nothing.

    Args:
        iterable: Any iterable; it is consumed lazily.
        n: Maximum number of items per batch; must be at least 1.

    Yields:
        Non-empty tuples of consecutive items.

    Raises:
        ValueError: If ``n`` is less than 1 (raised when iteration starts).
    """
    if n < 1:
        # islice(it, 0) is always empty, so n == 0 would end the loop at once
        # and silently drop every item instead of reporting the bad argument.
        raise ValueError('n must be at least one')
    it = iter(iterable)
    # An empty tuple is falsy, so the loop stops once the iterator is exhausted.
    while batch := tuple(islice(it, n)):
        yield batch

# Query arXiv for every node of the graph, 500 ids per request, and store each
# result's publication timestamp in unarxiv[<id>]['published'] (mutates
# unarxiv in place).
client = arxiv.Client()
# Drops a trailing version suffix such as 'v2' from a short id.
id_strip_re = re.compile(r'(?P<id>.+)v\d+$')
for id_batch in batched(g.nodes, 500):
    search = arxiv.Search(
        id_list = list(id_batch)
    )

    for result in client.results(search):
        # Named arxiv_id rather than id so the builtin id() is not shadowed.
        arxiv_id = result.get_short_id()
        m = id_strip_re.match(arxiv_id)
        if m:
            arxiv_id = m.group('id')
        if arxiv_id not in unarxiv:
            # Raise rather than call exit(): in a notebook kernel exit() is
            # IPython's exit hook, not a clean way to abort the running cell.
            raise RuntimeError(f'{arxiv_id} not found')
        unarxiv[arxiv_id]['published'] = result.published

# Fail fast if any node was left without a date: the chronological filter in a
# later cell reads unarxiv[...]['published'] for both endpoints of every edge
# and would otherwise stop with a bare KeyError.
missing_dates = [node for node in g.nodes if 'published' not in unarxiv[node]]
if missing_dates:
    raise RuntimeError(
        f'No publication date retrieved for {len(missing_dates)} papers, '
        f'e.g. {missing_dates[:5]}'
    )

In [None]:
# Drop every citation edge whose cited paper (edge source) has a later
# publication timestamp than the paper citing it (edge target).

anachronistic_edges = [
    (cited_id, citing_id)
    for cited_id, citing_id in g.edges
    if unarxiv[cited_id]['published'] > unarxiv[citing_id]['published']
]
for cited_id, citing_id in anachronistic_edges:
    g.remove_edge(cited_id, citing_id)

In [None]:
# Re-check acyclicity after the edges that ran against chronological order were removed.
nx.is_directed_acyclic_graph(g)

In [None]:
# Persist the finished graph as a pickle at the location configured in Config.
with open(Config.GRAPH_BIN_PATH, 'wb') as graph_file:
    pickle.dump(g, graph_file)