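This notebook reuses some helper functions from https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv to parse the CORD-19 bioRxiv/medRxiv JSON files and export each article's title and body text to a CSV file.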

In [None]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def format_name(author):
    """Build a display name of the form "first [middle ...] last".

    Parameters
    ----------
    author : mapping
        Must provide the keys 'first', 'middle' and 'last'. 'middle' is an
        iterable of strings (possibly empty); its items are joined with
        single spaces.

    Returns
    -------
    str
        The name parts joined by single spaces. The middle part is left out
        entirely when 'middle' is empty, so no double space is produced.
    """
    middles = author['middle']
    joined_middle = " ".join(middles)

    parts = [author['first'], author['last']]
    if middles:
        # Slot the middle name(s) between first and last name.
        parts.insert(1, joined_middle)

    return " ".join(parts)


def format_affiliation(affiliation):
    """Render an affiliation record as "institution, <location values...>".

    Parameters
    ----------
    affiliation : mapping
        May contain 'institution' (a string) and 'location' (a mapping whose
        values are strings). Missing or empty entries are skipped.

    Returns
    -------
    str
        The institution followed by the location values (in the mapping's
        own order), separated by ", ". Empty string when neither is present.
    """
    institution = affiliation.get('institution')
    location = affiliation.get('location')

    # Institution, when present, always comes first.
    parts = [institution] if institution else []
    if location:
        parts += list(location.values())

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Join the formatted names of several authors into one string.

    Parameters
    ----------
    authors : iterable of mapping
        Author records accepted by ``format_name``. When ``with_affiliation``
        is true, each record must also carry an 'affiliation' entry accepted
        by ``format_affiliation``.
    with_affiliation : bool, default False
        If true, append " (<affiliation>)" to every author whose formatted
        affiliation is non-empty.

    Returns
    -------
    str
        The formatted authors separated by ", ".
    """
    def render(author):
        # One author -> "Name" or "Name (Affiliation)".
        name = format_name(author)
        if not with_affiliation:
            return name
        affiliation = format_affiliation(author['affiliation'])
        return f"{name} ({affiliation})" if affiliation else name

    return ", ".join([render(author) for author in authors])

def format_body(body_text, paragraph_sep="\n"):
    """Assemble an article body from its paragraph records, grouped by section.

    Paragraphs that share a section name are merged under a single heading,
    even when they are not adjacent in ``body_text``. Sections appear in the
    order in which each name is first seen.

    Parameters
    ----------
    body_text : iterable of mapping
        Each record must provide the string entries 'section' and 'text'.
    paragraph_sep : str, default "\\n"
        Separator inserted between paragraphs of the same section. Without a
        separator the last word of one paragraph fuses with the first word of
        the next ("...end.Start..."); pass ``""`` to get that legacy output.

    Returns
    -------
    str
        For every section: the section name, a blank line, the section's
        paragraphs joined by ``paragraph_sep``, and a blank line. Empty string
        when ``body_text`` is empty.
    """
    # dict preserves insertion order, so sections keep first-appearance order.
    paragraphs_by_section = {}
    for entry in body_text:
        paragraphs_by_section.setdefault(entry['section'], []).append(entry['text'])

    # Collect the pieces and join once instead of repeated string +=.
    chunks = []
    for section, paragraphs in paragraphs_by_section.items():
        chunks.extend([section, "\n\n", paragraph_sep.join(paragraphs), "\n\n"])

    return "".join(chunks)

def format_bib(bibs):
    """Render bibliography entries as a single "; "-separated string.

    Parameters
    ----------
    bibs : dict or iterable of mapping
        Bibliography entries. When a dict (or dict subclass) is given, only
        its values are used. Every entry must provide 'title', 'authors',
        'venue' and 'year'; 'authors' is passed to ``format_authors``.

    Returns
    -------
    str
        One "title, authors, venue, year" item per entry, joined by "; ".
        Each field is converted with ``str``, so a missing value stored as
        ``None`` appears as the text "None".

    Notes
    -----
    The input is never modified: the author string is built on the side
    rather than written back into the entry, so no defensive copy is needed.
    """
    # isinstance (not type(...) ==) so dict subclasses such as OrderedDict
    # are unwrapped too instead of being iterated as keys.
    if isinstance(bibs, dict):
        bibs = list(bibs.values())

    formatted = []
    for bib in bibs:
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = [bib['title'], authors, bib['venue'], bib['year']]
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

In [4]:
# Directory holding one JSON file per bioRxiv/medRxiv article.
biorxiv_dir = './input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/'
# os.listdir returns entries in arbitrary order; sort so that the row order of
# everything derived from all_files is the same on every run and machine.
filenames = sorted(os.listdir(biorxiv_dir))
print("Number of articles retrieved from biorxiv:", len(filenames))
all_files = []

for filename in filenames:
    # The context manager closes each handle right after parsing instead of
    # leaving hundreds of open files for the garbage collector.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as json_file:
        all_files.append(json.load(json_file))

Number of articles retrieved from biorxiv: 885


In [5]:
# Build one [title, [body]] row per article. The body is flattened to a single
# line and wrapped in a one-element list for the 'paragraphs' column.
cleaned_files = []

for article in tqdm(all_files):
    flat_body = format_body(article['body_text']).replace('\n', ' ')
    cleaned_files.append([article['metadata']['title'], [flat_body]])

HBox(children=(IntProgress(value=0, max=885), HTML(value='')))




In [7]:
# Column order mirrors the [title, paragraphs] layout of each cleaned_files row.
col_names = ['title', 'paragraphs']

clean_df = pd.DataFrame(data=cleaned_files, columns=col_names)
clean_df.head()

Unnamed: 0,title,paragraphs
0,Multimerization of HIV-1 integrase hinges on c...,"[ In the absence of a curative treatment, the..."
1,Time-varying transmission dynamics of Novel Co...,"[Introduction Eighteen years ago, severe acut..."
2,p53 is not necessary for DUX4 pathology,[Introduction Facioscapulohumeral muscular dy...
3,Virological assessment of hospitalized cases o...,"[cases, providing proof of active virus replic..."
4,Potential impact of seasonal forcing on a SARS...,[ (2.2 with 90% high density interval 1.4-3.8...


In [9]:
# Persist the cleaned table; index=False keeps the row index out of the CSV.
output_csv = './input/biorxiv_clean_temp.csv'
clean_df.to_csv(output_csv, index=False)