In [2]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
data = "/home/phani/cord-19_2020-04-10/2020-04-10/biorxiv_medrxiv/pdf_json/"
filenames = os.listdir(data)
print(len(filenames))

1625


In [4]:
all_files = []
for filename in filenames:
    filename = data + filename
    file = json.load(open(filename, 'rb'))
    all_files.append(file)
file = all_files[0]
print("Dictionary keys:", file.keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [11]:
print("body_text content:")
pprint(file['body_text'][:2], depth=3)

body_text content:
[{'cite_spans': [{...}, {...}, {...}, {...}, {...}, {...}],
  'ref_spans': [],
  'section': 'Introduction',
  'text': 'Since December 2019, coronavirus disease 2019 (COVID-19) caused by '
          'severe acute respiratory syndrome coronavirus-2 (SARS-CoV-2) has '
          'quickly spread across the world. 1 Approximately 20-30% of cases '
          'would develop severe illness, and some need further intervention in '
          'intensive care unit. Organ dysfunction including acute respiratory '
          'distress syndrome, shock, acute cardiac injury, and acute renal '
          'injury, could occur in severe cases with COVID-19, which lead to '
          'poor clinical outcome. 2, 3 Following SARS-CoV-2 infection, a high '
          'viral load and overexuberant host immune response involving innate '
          'and acquired immunity, simultaneously contributes to the '
          'pathogenesis of COVID-19 and organ injury. [2] [3] [4] The '
          'activate

In [12]:
texts = [(di['section'], di['text']) for di in file['body_text']]
texts_di = {di['section']: "" for di in file['body_text']}
for section, text in texts:
    texts_di[section] += text

pprint(list(texts_di.keys()))

['Introduction',
 'Data collection',
 'Antibody and cytokine assay',
 'Statistical analysis',
 'Results',
 'Discussion',
 'Contributors',
 'Declaration of interests',
 'Data sharing',
 'Acknowledgments',
 'Figure legends']


In [13]:
body = ""

for section, text in texts_di.items():
    body += section
    body += "\n\n"
    body += text
    body += "\n\n"

print(body[:3000])

Introduction

Since December 2019, coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus-2 (SARS-CoV-2) has quickly spread across the world. 1 Approximately 20-30% of cases would develop severe illness, and some need further intervention in intensive care unit. Organ dysfunction including acute respiratory distress syndrome, shock, acute cardiac injury, and acute renal injury, could occur in severe cases with COVID-19, which lead to poor clinical outcome. 2, 3 Following SARS-CoV-2 infection, a high viral load and overexuberant host immune response involving innate and acquired immunity, simultaneously contributes to the pathogenesis of COVID-19 and organ injury. [2] [3] [4] The activated host immunity is characterized as lymphopenia, cytokine release storm (CRS), and dysfunctional immune responses to virus-specific antigen. Increasing clinical data indicated that the neutrophil-to-lymphocyte ratio (NLR) was identified as a powerful predictive and p

In [15]:
#Seeing keys cobntained in metadata
print(all_files[0]['metadata'].keys())

dict_keys(['title', 'authors'])


In [19]:
#Seeing the title
all_files[0]['metadata']['title']

'Immune phenotyping based on neutrophil-to-lymphocyte ratio and IgG predicts disease severity and outcome for patients with COVID-19'

In [21]:
#printing the authors
authors = all_files[0]['metadata']['authors']
authors[:3]

[{'first': 'Bicheng',
  'middle': [],
  'last': 'Zhang',
  'suffix': '',
  'affiliation': {},
  'email': ''},
 {'first': 'Xiaoyang',
  'middle': [],
  'last': 'Zhou',
  'suffix': '',
  'affiliation': {},
  'email': ''},
 {'first': 'Chengliang',
  'middle': [],
  'last': 'Zhu',
  'suffix': '',
  'affiliation': {},
  'email': ''}]

In [29]:
def format_name(author):
    middle_name = " ".join(author['middle'])
    
    if author['middle']:
        return " ".join([author['first'], middle_name, author['last']])
    else:
        return " ".join([author['first'], author['last']])


def format_affiliation(affiliation):
    text = []
    location = affiliation.get('location')
    if location:
        text.extend(list(affiliation['location'].values()))
    
    institution = affiliation.get('institution')
    if institution:
        text = [institution] + text
    return ", ".join(text)

def format_authors(authors, with_affiliation=False):
    name_ls = []
    
    for author in authors:
        name = format_name(author)
        if with_affiliation:
            affiliation = format_affiliation(author['affiliation'])
            if affiliation:
                name_ls.append(f"{name} ({affiliation})")
            else:
                name_ls.append(name)
        else:
            name_ls.append(name)
    
    return ", ".join(name_ls)

def format_body(body_text):
    texts = [(di['section'], di['text']) for di in body_text]
    texts_di = {di['section']: "" for di in body_text}
    
    for section, text in texts:
        texts_di[section] += text

    body = ""

    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"
    
    return body

def format_bib(bibs):
    if type(bibs) == dict:
        bibs = list(bibs.values())
    bibs = deepcopy(bibs)
    formatted = []
    
    for bib in bibs:
        bib['authors'] = format_authors(
            bib['authors'], 
            with_affiliation=False
        )
        formatted_ls = [str(bib[k]) for k in ['title', 'authors', 'venue', 'year']]
        formatted.append(", ".join(formatted_ls))

    return "; ".join(formatted)

In [30]:
for author in authors:
    print("Name:", format_name(author))
    print("Affiliation:", format_affiliation(author['affiliation']))
    print()

Name: Bicheng Zhang
Affiliation: 

Name: Xiaoyang Zhou
Affiliation: 

Name: Chengliang Zhu
Affiliation: 

Name: Fan Feng
Affiliation: 

Name: Yanru Qiu
Affiliation: 

Name: Jia Feng
Affiliation: 

Name: Qingzhu Jia
Affiliation: 

Name: Qibin Song
Affiliation: 

Name: Bo Zhu
Affiliation: 

Name: Jun Wang
Affiliation: 

Name:  Zhang
Affiliation: 

Name: Y Qiu
Affiliation: 

Name: F Feng
Affiliation: 

Name: J Feng
Affiliation: 

Name: ( Q Jia
Affiliation: 



In [31]:
#formating authors
authors = all_files[4]['metadata']['authors']
print("Formatting without affiliation:")
print(format_authors(authors, with_affiliation=False))
print("\nFormatting with affiliation:")
print(format_authors(authors, with_affiliation=True))

Formatting without affiliation:
Derrick Deming, Karen Lee, Tracey Mcsherry, Ronnie R Wei, Tim Edmunds, Scott C Garman

Formatting with affiliation:
Derrick Deming (University of Massachusetts, 9 01003, Amherst, MA, USA), Karen Lee (Biologics Research, 10 Massachusetts, 01701, Sanofi, Framingham, USA), Tracey Mcsherry (Biologics Research, 10 Massachusetts, 01701, Sanofi, Framingham, USA), Ronnie R Wei (Biologics Research, 10 Massachusetts, 01701, Sanofi, Framingham, USA), Tim Edmunds (Biologics Research, 10 Massachusetts, 01701, Sanofi, Framingham, USA), Scott C Garman (University of Massachusetts, 9 01003, Amherst, MA, USA)


In [32]:
bibs = list(file['bib_entries'].values())
pprint(bibs[:2], depth=4)

[{'authors': [{'first': 'F', 'last': 'Wu', 'middle': [], 'suffix': ''},
              {'first': 'S', 'last': 'Zhao', 'middle': [], 'suffix': ''},
              {'first': 'B', 'last': 'Yu', 'middle': [], 'suffix': ''}],
  'issn': '',
  'other_ids': {'DOI': ['10.1038/s41586-020-2008-3']},
  'pages': '',
  'ref_id': 'b0',
  'title': 'A new coronavirus associated with human respiratory disease in '
           'China',
  'venue': 'Nature',
  'volume': '',
  'year': 2020},
 {'authors': [{'first': 'C', 'last': 'Huang', 'middle': [], 'suffix': ''},
              {'first': 'Y', 'last': 'Wang', 'middle': [], 'suffix': ''},
              {'first': 'X', 'last': 'Li', 'middle': [], 'suffix': ''}],
  'issn': '',
  'other_ids': {'DOI': ['10.1016/S0140-6736(20)30183-5']},
  'pages': '497--506',
  'ref_id': 'b1',
  'title': 'Clinical features of patients infected with 2019 novel coronavirus '
           'in Wuhan, China',
  'venue': 'Lancet',
  'volume': '395',
  'year': 2020}]


In [33]:
format_authors(bibs[1]['authors'], with_affiliation=False)

'C Huang, Y Wang, X Li'

In [36]:
from copy import deepcopy
bib_formatted = format_bib(bibs[:5])
print(bib_formatted)

A new coronavirus associated with human respiratory disease in China, F Wu, S Zhao, B Yu, Nature, 2020; Clinical features of patients infected with 2019 novel coronavirus in Wuhan, China, C Huang, Y Wang, X Li, Lancet, 2020; Clinical characteristics of 138 hospitalized patients with 2019 novel coronavirus-infected pneumonia in Wuhan, China, D Wang, B Hu, C Hu, JAMA, 2020; Pathological findings of COVID-19 associated with acute respiratory distress syndrome, Z Xu, L Shi, Y Wang, Lancet, 2020; Neutrophil-to-lymphocyte ratio predicts severe illness patients with 2019 novel coronavirus in the early stage, J Liu, Y Liu, P Xiang, , None


In [37]:
cleaned_files = []

for file in tqdm(all_files):
    features = [
        file['paper_id'],
        file['metadata']['title'],
        format_authors(file['metadata']['authors']),
        format_authors(file['metadata']['authors'], 
                       with_affiliation=True),
        format_body(file['abstract']),
        format_body(file['body_text']),
        format_bib(file['bib_entries']),
        file['metadata']['authors'],
        file['bib_entries']
    ]
    
    cleaned_files.append(features)

100%|██████████| 1625/1625 [00:02<00:00, 809.69it/s]


In [38]:
col_names = [
    'paper_id', 
    'title', 
    'authors',
    'affiliations', 
    'abstract', 
    'text', 
    'bibliography',
    'raw_authors',
    'raw_bibliography'
]

clean_df = pd.DataFrame(cleaned_files, columns=col_names)
clean_df.head()


Unnamed: 0,paper_id,title,authors,affiliations,abstract,text,bibliography,raw_authors,raw_bibliography
0,97e0efc17b5a10c75f7c83b08423d27585a31df5,Immune phenotyping based on neutrophil-to-lymp...,"Bicheng Zhang, Xiaoyang Zhou, Chengliang Zhu, ...","Bicheng Zhang, Xiaoyang Zhou, Chengliang Zhu, ...",,"Introduction\n\nSince December 2019, coronavir...",A new coronavirus associated with human respir...,"[{'first': 'Bicheng', 'middle': [], 'last': 'Z...","{'BIBREF0': {'ref_id': 'b0', 'title': 'A new c..."
1,33eff3ff8721d2abd9204e3aabd504e167f3f01d,,"Yoshiyuki Sugishita, Junko Kurita, Tamie Sugaw...",Yoshiyuki Sugishita (National Institute of Inf...,Abstract\n\nBackground: To control COVID-19 ou...,Introduction\n\nThe initial case of COVID-19 i...,"Japan Ministry of Health, Labour and Welfare. ...","[{'first': 'Yoshiyuki', 'middle': [], 'last': ...","{'BIBREF0': {'ref_id': 'b0', 'title': 'Japan M..."
2,c03736c4e12dd89176ccb6d67df4b0bdaecf15dd,Temperature dependence of COVID-19 transmission,Alessio Notari,"Alessio Notari (Universitat de Barcelona, Mart...",Abstract\n\nThe recent coronavirus pandemic fo...,INTRODUCTION\n\nThe recent coronavirus (COVID-...,The Effects of Temperature and Relative Humidi...,"[{'first': 'Alessio', 'middle': [], 'last': 'N...","{'BIBREF0': {'ref_id': 'b0', 'title': 'The Eff..."
3,1f632ba80bc3f4c8a48b5d436dd843c3c48bbcf2,PriSeT: Efficient De Novo Primer Discovery,"Marie Hoffmann, Michael T Monaghan, Knut Reinert","Marie Hoffmann, Michael T Monaghan, Knut Reinert",Abstract\n\nMotivation: DNA metabarcoding is a...,\n\nmonly applied technique used to infer the ...,A molecular evolutionary framework for the phy...,"[{'first': 'Marie', 'middle': [], 'last': 'Hof...","{'BIBREF1': {'ref_id': 'b1', 'title': 'A molec..."
4,3c70c99afc7a38df3c4807857856ea258d378429,The molecular basis for Pompe disease 3 reveal...,"Derrick Deming, Karen Lee, Tracey Mcsherry, Ro...","Derrick Deming (University of Massachusetts, 9...",Abstract\n\nPompe disease results from a defec...,\n\nIntroduction encompasses residues 347-726 ...,therapy (ERT) in late-onset Type II Glycogenos...,"[{'first': 'Derrick', 'middle': [], 'last': 'D...","{'BIBREF0': {'ref_id': 'b0', 'title': 'therapy..."


In [39]:
clean_df.to_csv('biorxiv_clean.csv', index=False)