Re-using some functions from https://www.kaggle.com/xhlulu/cord-19-eda-parse-json-and-generate-clean-csv

In [1]:
import os
import json
from pprint import pprint
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import re
from nltk import tokenize



In [2]:
def format_name(author):
    """Build a display name from an author record.

    Parameters
    ----------
    author : dict
        Must provide the keys 'first' (str), 'middle' (list of str) and
        'last' (str).

    Returns
    -------
    str
        The first, middle and last name parts joined by single spaces.
        Empty parts are skipped so that a record with a missing first or
        last name does not yield leading, trailing or doubled spaces.
    """
    parts = [author['first'], *author['middle'], author['last']]
    return " ".join(part for part in parts if part)


def format_affiliation(affiliation):
    """Render an affiliation record as a single comma-separated string.

    The institution (if present and non-empty) comes first, followed by
    every value of the 'location' mapping (if present and non-empty) in
    the mapping's own order. Returns an empty string when neither is set.
    """
    parts = []

    institution = affiliation.get('institution')
    if institution:
        parts.append(institution)

    location = affiliation.get('location')
    if location:
        parts += list(location.values())

    return ", ".join(parts)

def format_authors(authors, with_affiliation=False):
    """Format a sequence of author records as one comma-separated string.

    Each author is rendered with ``format_name``. When ``with_affiliation``
    is true and ``format_affiliation`` yields a non-empty string for the
    author's 'affiliation' entry, that string is appended in parentheses.
    """
    name_ls = []

    for author in authors:
        name = format_name(author)
        # The affiliation is only looked up when the caller asked for it.
        affiliation = format_affiliation(author['affiliation']) if with_affiliation else ""
        name_ls.append(f"{name} ({affiliation})" if affiliation else name)

    return ", ".join(name_ls)

def format_body(body_text):
    """Merge paragraphs by section and render them as one text block.

    ``body_text`` is a sequence of dicts with 'section' and 'text' keys.
    Texts sharing a section name are concatenated in input order; sections
    appear in order of first occurrence. Each section is emitted as its
    name, a blank line, its text, and another blank line.
    """
    # Read every (section, text) pair up front so a malformed entry fails
    # before any grouping happens.
    pairs = [(para['section'], para['text']) for para in body_text]

    grouped = {}
    for section, text in pairs:
        grouped[section] = grouped.get(section, "") + text

    chunks = [section + "\n\n" + text + "\n\n" for section, text in grouped.items()]
    return "".join(chunks)

def format_bib(bibs):
    """Format bibliography entries as a single '; '-separated string.

    Parameters
    ----------
    bibs : dict or iterable of dict
        Bibliography entries. When a mapping is given (including dict
        subclasses), its values are used. Each entry must provide the keys
        'title', 'authors', 'venue' and 'year'.

    Returns
    -------
    str
        Every entry rendered as "title, authors, venue, year" (each field
        passed through ``str``), with entries joined by "; ".

    The input is never modified: the formatted author string is held in a
    local variable instead of being written back into the entry, so no
    defensive deep copy of the bibliography is needed.
    """
    if isinstance(bibs, dict):
        bibs = list(bibs.values())

    formatted = []
    for bib in bibs:
        authors = format_authors(bib['authors'], with_affiliation=False)
        fields = (bib['title'], authors, bib['venue'], bib['year'])
        formatted.append(", ".join(str(field) for field in fields))

    return "; ".join(formatted)

def format_body_text(body_text, sep=""):
    """Concatenate the 'text' field of every paragraph in ``body_text``.

    Parameters
    ----------
    body_text : iterable of dict
        Paragraph records, each providing a 'text' string.
    sep : str, optional
        String inserted between consecutive paragraphs. The default ``""``
        keeps the historical behaviour of gluing paragraphs together
        directly; pass ``" "`` or ``"\\n"`` to stop the last sentence of one
        paragraph from fusing with the first sentence of the next.

    Returns
    -------
    str
        The joined paragraph texts (empty string for empty input).
    """
    return sep.join(di['text'] for di in body_text)
    
    
def format_corpus_text(body_text, min_len=18, max_len=128):
    """Split text into sentences and keep those suitable for the corpus.

    Parameters
    ----------
    body_text : str
        Raw article text.
    min_len, max_len : int
        A sentence is kept only if its whitespace-separated word count is
        strictly between these two bounds.

    Returns
    -------
    list of str
        Sentences (as produced by ``tokenize.sent_tokenize``) that do not
        contain the standalone word "copyright", satisfy the word-count
        bounds, and contain at least one word longer than five characters.
    """
    junk_text = "copyright"

    def remove_braces_brackets(text):
        # Drop parenthesised numeric markers such as "(12)".
        text = re.sub(r'\([0-9]+\)', '', text)
        # Drop square-bracketed spans such as "[3, 4]". The character class
        # excludes "]" so each bracket pair is matched on its own; excluding
        # ")" instead let the greedy match run from the first "[" to a much
        # later "]" and delete the ordinary text in between.
        text = re.sub(r'\[[^\]]*\]', '', text)
        return text

    body_text = remove_braces_brackets(body_text)
    text_lines = []
    for line in tokenize.sent_tokenize(body_text):
        words = line.split()
        # An empty word list would make max() raise; such a line can never
        # satisfy the length bounds anyway.
        if not words or junk_text in words:
            continue
        max_word_len = max(len(word) for word in words)
        if min_len < len(words) < max_len and max_word_len > 5:
            text_lines.append(line)

    return text_lines

In [3]:
# Load every file found in the bioRxiv/medRxiv directory as parsed JSON.
biorxiv_dir = './input/CORD-19-research-challenge/biorxiv_medrxiv/biorxiv_medrxiv/'
filenames = os.listdir(biorxiv_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))
all_files = []

for filename in filenames:
    # The context manager closes each handle as soon as it has been parsed,
    # instead of leaving one open file per article for the garbage collector.
    with open(os.path.join(biorxiv_dir, filename), 'rb') as json_file:
        all_files.append(json.load(json_file))

Number of articles retrieved from biorxiv: 885


In [4]:

# panda_text collects one [title, [body]] row per article for the DataFrame;
# corpus_text collects one list of filtered sentences per article.
panda_text = []
corpus_text = []

for file in tqdm(all_files):
    file_text = format_body_text(file['body_text'])

    # Articles contributing five or fewer usable sentences are left out of
    # the corpus (they still get a DataFrame row below).
    file_lines = format_corpus_text(file_text)
    if len(file_lines) > 5:
        corpus_text.append(file_lines)

    # The full body is wrapped in a one-element list for the second column.
    text = [file_text]
    features = [file['metadata']['title'], text]
    panda_text.append(features)

HBox(children=(IntProgress(value=0, max=885), HTML(value='')))




In [5]:
# Two columns: the article title and its one-element list holding the body text.
col_names = ['title', 'paragraphs']

clean_df = pd.DataFrame(data=panda_text, columns=col_names)
clean_df.head()



Unnamed: 0,title,paragraphs
0,Multimerization of HIV-1 integrase hinges on c...,"[In the absence of a curative treatment, the h..."
1,Time-varying transmission dynamics of Novel Co...,"[Eighteen years ago, severe acute respiratory ..."
2,p53 is not necessary for DUX4 pathology,[Facioscapulohumeral muscular dystrophy (FSHD)...
3,Virological assessment of hospitalized cases o...,[Pharyngeal virus shedding was very high durin...
4,Potential impact of seasonal forcing on a SARS...,[(2.2 with 90% high density interval 1.4-3.8 (...


In [11]:
# Persist the title/paragraphs table without the row index.
clean_df.to_csv(path_or_buf='./input/biorxiv_temp.csv', index=False)

In [6]:
# Write the corpus as plain text: one sentence per line, with a blank line
# after each article's sentences.
# UTF-8 is requested explicitly so that non-ASCII characters in the article
# text are written the same way regardless of the platform's default encoding.
with open('./input/biorxiv_temp.txt', 'w', encoding='utf-8') as corp_file:
    for lines in corpus_text:
        corp_file.writelines("%s\n" % line for line in lines)
        corp_file.write("\n")
