In [6]:
import os
import json
import numpy as np
import pandas as pd
from copy import deepcopy
from tqdm import tqdm

In [7]:
# Directory containing the JSON files to parse.
data = "pdf_json/"
# Bare file names (not full paths) of every entry in the directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row
# order of anything built from `filenames` may differ between machines.
filenames = os.listdir(data)
# Sanity check: how many files were found.
print(len(filenames))


1934


In [8]:
# Parse every JSON file listed in `filenames` into a list of dicts.
all_files = []
for filename in filenames:
    # Use a context manager so each file handle is closed as soon as it has
    # been parsed, instead of leaving one open descriptor per file behind.
    with open(os.path.join(data, filename), 'rb') as fp:
        all_files.append(json.load(fp))

# Peek at the first parsed document to see which top-level keys are present.
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [9]:
def format_name(author):
    """Build a single display name from an author record.

    Args:
        author: Mapping with a 'first' string, a 'middle' list of strings
            and a 'last' string.

    Returns:
        The non-empty name parts joined by single spaces, in the order
        first, middle parts, last. Returns "" when every part is empty.

    Raises:
        KeyError: If 'first', 'middle' or 'last' is missing from `author`.
    """
    parts = [author['first'], *author['middle'], author['last']]
    # Drop empty parts so a missing first/last name (or an empty middle
    # item) does not leave a leading, trailing or doubled space.
    return " ".join(part for part in parts if part)


def format_affiliation(affiliation):
    """Render an affiliation record as one comma-separated string.

    Args:
        affiliation: Mapping that may contain an 'institution' value and a
            'location' mapping; either may be absent or empty.

    Returns:
        The institution (when truthy) followed by every value of the
        location mapping (when truthy), joined with ", ". Returns "" when
        neither is present.
    """
    parts = []

    # The institution always leads the output.
    institution = affiliation.get('institution')
    if institution:
        parts.append(institution)

    # Location values follow in the mapping's own iteration order.
    location = affiliation.get('location')
    if location:
        parts += list(location.values())

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Format a sequence of author records as one comma-separated string.

    Args:
        authors: Iterable of author records accepted by `format_name`.
            When `with_affiliation` is true, each record must also carry
            an 'affiliation' entry accepted by `format_affiliation`.
        with_affiliation: If true, append "(<affiliation>)" after each name
            whose formatted affiliation is non-empty.

    Returns:
        The formatted authors joined with ", ".
    """
    def render(author):
        # Format one author, optionally followed by the affiliation.
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join(render(author) for author in authors)

def format_body(body_text):
    """Flatten a list of section-tagged paragraphs into one text blob.

    Paragraphs sharing a section name are grouped together (even when they
    are not adjacent in the input), and sections are emitted in order of
    first appearance.

    Args:
        body_text: Iterable of mappings, each with a 'section' string and a
            'text' string.

    Returns:
        For every section: the section name, a blank line, the section's
        paragraphs separated by a single newline, and a trailing blank
        line. Returns "" for empty input.

    Raises:
        KeyError: If an entry lacks 'section' or 'text'.
    """
    # dicts keep insertion order, so sections stay in first-seen order.
    paragraphs_by_section = {}
    for entry in body_text:
        paragraphs_by_section.setdefault(entry['section'], []).append(entry['text'])

    chunks = []
    for section, paragraphs in paragraphs_by_section.items():
        chunks.append(section)
        chunks.append("\n\n")
        # Keep a newline between paragraphs; gluing them together directly
        # would fuse the last word of one paragraph with the first word of
        # the next.
        chunks.append("\n".join(paragraphs))
        chunks.append("\n\n")

    # A single join avoids the quadratic cost of repeated string +=.
    return "".join(chunks)

def format_bib(bibs):
    """Render bibliography entries as a single "; "-separated string.

    Args:
        bibs: Either a mapping whose values are bibliography entries, or an
            iterable of such entries. Each entry must provide 'title',
            'authors', 'venue' and 'year'; 'authors' is a list of author
            records accepted by `format_authors`.

    Returns:
        One "title, authors, venue, year" string per entry, joined with
        "; ". The input entries are left unmodified.

    Raises:
        KeyError: If an entry lacks one of the four required fields.
    """
    # isinstance also accepts dict subclasses, unlike an exact type check.
    if isinstance(bibs, dict):
        bibs = bibs.values()

    formatted = []
    for bib in bibs:
        # Format the authors into a local value instead of overwriting
        # bib['authors']; the caller's data stays untouched without having
        # to deep-copy the whole bibliography first.
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        # NOTE(review): a field holding None (e.g. an unknown year) is
        # rendered as the literal text "None" by str().
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

In [10]:
# Build one row of features per parsed paper; the order of the values in
# each row is the column order of the resulting table.
cleaned_files = []

for file in tqdm(all_files):
    metadata = file['metadata']
    raw_authors = metadata['authors']
    raw_bibliography = file['bib_entries']

    cleaned_files.append([
        file['paper_id'],
        metadata['title'],
        # Authors as plain names, then again with their affiliations.
        format_authors(raw_authors),
        format_authors(raw_authors, with_affiliation=True),
        # The abstract goes through the same formatter as the body text.
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(raw_bibliography),
        # Keep the unformatted structures alongside the formatted strings.
        raw_authors,
        raw_bibliography,
    ])

100%|██████████| 1934/1934 [00:05<00:00, 375.96it/s]


In [12]:
# Column labels, listed in the same order as the values in each row of
# `cleaned_files`.
col_names = [
    'paper_id', 'title', 'authors', 'affiliations', 'abstract',
    'text', 'bibliography', 'raw_authors', 'raw_bibliography',
]

# One row per paper; show the first few rows as the cell output.
clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,db26e8bd5250847218c510074b99460d67c29343,Title: Failed detection of the full-length gen...,"Fengyu Hu, Fengjuan Chen, Yaping Wang, Teng Xu...","Fengyu Hu (Guangzhou Medical University, 627 D...",Abstract\n\nOver 10 percent of recovered and d...,\n\nThe novel coronavirus (SARS-CoV-2) infecti...,Positive RT-PCR Test Results in Patients Recov...,"[{'first': 'Fengyu', 'middle': [], 'last': 'Hu...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Positiv..."
1,66478ffa5ba4f08de08cbe2006dca2ee463af8d4,Defining high-value information for COVID-19 d...,,,Abstract\n\nInitial projections from the first...,\n\nThe novel coronavirus SARS-CoV-2 has sprea...,We estimated hospitalizations through an appli...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'We esti..."
2,fec0c97b9bdc012b76f082b8bcd6ba0efbb4f5c3,Viral gain-of-function experiments uncover res...,"Rohan Maddamsetti, Daniel T Johnson, Stephanie...","Rohan Maddamsetti (Harvard Medical School, Bos...",Abstract\n\nViral gain-of-function mutations a...,"\n\npopulations. Indeed, some researchers have...",No reuse allowed without permission. The copyr...,"[{'first': 'Rohan', 'middle': [], 'last': 'Mad...","{'BIBREF0': {'ref_id': 'b0', 'title': 'No reus..."
3,331a7033bb8e6d4618e4e83b74aff1aec42764a6,TITLE: Pulmonary Metagenomic Sequencing Sugges...,,,,"BACKGROUND\n\nLast year in the United States, ...",Bill Young Cell Transplantation Program operat...,[],"{'BIBREF0': {'ref_id': 'b0', 'title': 'Bill Yo..."
4,7e80848c7b22e870b0e0a458dce96f6955576f04,The network structure and eco-evolutionary dyn...,"Shai Pilosof, Sergio A Alcala-Corona, Tong Wan...",Shai Pilosof (Ben-Gurion University of the Neg...,Abstract\n\nAs a heritable sequence-specific a...,\n\nantigens that are novel to the host immune...,Compartmentalization increases food-web persis...,"[{'first': 'Shai', 'middle': [], 'last': 'Pilo...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Compart..."


In [13]:
# Write the table to json_parse.csv; index=False leaves the row index out of the file.
clean_df.to_csv('json_parse.csv', index=False)