In [1]:
from config import Config
import json
import tarfile

In [2]:
def process_jsonl(src) -> dict:
    """Reduce a JSONL stream of papers to a dict keyed by paper ID.

    Args:
        src: Iterable of JSONL lines (``str`` or ``bytes``), such as an open
            text file or a binary file object extracted from a tar archive.
            Blank lines are skipped.

    Returns:
        Dict mapping each ``paper_id`` to a reduced record with the keys
        ``title``, ``authors``, ``categories``, ``abstract``, ``discipline``
        and ``arxiv_bib_ids``. ``arxiv_bib_ids`` is de-duplicated and sorted
        so that repeated runs produce identical output. If a ``paper_id``
        occurs more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a paper lacks one of the fields read below.
    """
    papers = {}
    # Iterate lazily instead of readlines() so the raw input is not held in memory as a list
    for line in src:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        paper = json.loads(line)
        metadata = paper['metadata']

        # Each parsed author is joined as element 1 followed by element 0
        # (presumably "first last" -- confirm against the data)
        authors = [' '.join(author[1::-1]) for author in metadata['authors_parsed']]

        # Collect every arXiv ID referenced by the bibliography: the resolved ID of
        # the entry itself (when present and non-empty) plus any IDs listed under
        # 'contained_arXiv_ids'.
        bib_ids = set()
        for bib_entry in paper['bib_entries'].values():
            if 'ids' in bib_entry and bib_entry['ids']['arxiv_id']:
                bib_ids.add(bib_entry['ids']['arxiv_id'])
            bib_ids.update(contained['id'] for contained in bib_entry['contained_arXiv_ids'])

        # This is aggressive dimensionality reduction - see README for discussion of why these columns were selected
        papers[paper['paper_id']] = {
            'title': metadata['title'],
            'authors': authors,
            # TODO: Consider splitting categories into prefix and postfix (nested dict?) for better similarity comparisons between papers
            'categories': metadata['categories'].split(' '),
            'abstract': metadata['abstract'],
            'discipline': paper['discipline'],
            # Sorted so the serialised output does not depend on set iteration order
            'arxiv_bib_ids': sorted(bib_ids),
        }
    return papers

In [3]:
# Parse the source JSONL first, then write the reduced dict to a new JSON file.
# Parsing before opening the output means a failure while reading does not leave
# a truncated output file behind, and the `with` block closes the input handle.
# JSON text is UTF-8 by specification, so do not rely on the locale's default encoding.
with open(Config.DATA_JSONL_PATH, encoding='utf-8') as infile:
    reduced_papers = process_jsonl(infile)

with open(Config.REDUCED_JSON_PATH, 'w') as outfile:
    # Indentation has small file size and performance impacts, but the legibility is worth it here.
    json.dump(reduced_papers, outfile, indent=2)

In [5]:
# unarXiv processing: merge the reduced papers of every file in the tar archive
# into a single dict (later members overwrite earlier ones on duplicate paper IDs).
unarxiv = {}
with tarfile.open(Config.UNARXIV_TAR_PATH) as archive:
    for entry in archive:
        member_file = archive.extractfile(entry)
        # extractfile() returns None for members that carry no data (e.g. directories)
        if member_file is None:
            continue
        # Close each extracted member as soon as it has been processed
        with member_file:
            unarxiv.update(process_jsonl(member_file))

In [6]:
# Persist the merged unarXiv dict as indented JSON.
# NOTE(review): this path is hard-coded while the other paths in this notebook come
# from Config -- consider moving it there.
with open('../data/ingested/unarxiv.json', 'w') as unarxiv_outfile:
    json.dump(unarxiv, unarxiv_outfile, indent=2)