In [1]:
import spacy
from os import listdir
from os.path import isfile, join
import warnings
from tqdm import tqdm
import json
import pickle
import random
import pandas as pd
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [2]:
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [3]:
dir_base = "document_parses/pmc_json/" # point this to the data directory

# functions below to read data files

def read_file(filename):
    input_file_text = open(filename , encoding='utf-8').read()
    return input_file_text


def read_directory_files(directory, start_num, end_num):
    files = [f for f in listdir(directory)[start_num:end_num] if isfile(join(directory, f))]
    return files

def append_files(directory,file_list):
    file_texts = []
    for f in file_list:
        file_text = read_file(join(directory, f))
        # print(file_text)
        file_texts.append( {'filename': f, 'content': json.loads(file_text)} )
    return file_texts

In [4]:
# List of file names
list_o_f = read_directory_files(dir_base, 0, 85059)

In [5]:
# pick a random sample of 50000 document filenames to form the corpus
random.seed(81)
sample_of_files = random.sample(list_o_f, k=40000)
len(set(sample_of_files))

40000

In [6]:
# Viewing filenames
sample_of_files[0:10]

['PMC7383526.xml.json',
 'PMC7329281.xml.json',
 'PMC7221011.xml.json',
 'PMC7444305.xml.json',
 'PMC7414283.xml.json',
 'PMC7267247.xml.json',
 'PMC7357501.xml.json',
 'PMC3132365.xml.json',
 'PMC7106341.xml.json',
 'PMC7101598.xml.json']

In [7]:
# the corpus is a dictionary where a key has the file names and the other key has the content which consists of the text, title, author names, etc.... Code below explors the content of the files.
text_corpus = append_files(dir_base, sample_of_files)
print(len(text_corpus), text_corpus[0].keys(), sep='\n')

40000
dict_keys(['filename', 'content'])


In [8]:
text_corpus[81]['content'].keys()

dict_keys(['paper_id', 'metadata', 'body_text', 'ref_entries', 'back_matter', 'bib_entries'])

In [9]:
text_corpus[81]['content']['metadata']['title']

'Identification of MicroRNA-Like RNAs in Mycelial and Yeast Phases of the Thermal Dimorphic Fungus Penicillium marneffei\n'

In [10]:
text_corpus[81]['content']['metadata']['title']

'Identification of MicroRNA-Like RNAs in Mycelial and Yeast Phases of the Thermal Dimorphic Fungus Penicillium marneffei\n'

In [11]:
# Building code to extract sections of documents such as the "introduction" of each document
for x in range(50):
    for i in range(len(text_corpus[x]['content']['body_text'])):
        for k,v in text_corpus[x]['content']['body_text'][i].items():
            if k == 'section' and ('introduction' in v.lower().strip().split()):
                print(k)
                print(v)
                print(text_corpus[x]['content']['body_text'][i]['text'].strip())
                print('\n')

section
INTRODUCTION
On 31 December 2019, an outbreak of respiratory disease caused by a novel coronavirus (CoV) first detected in Wuhan City, Hubei Province, China, was initially reported to the World Health Organization (WHO) and has continued to expand globally (https://www.fda.gov/emergency-preparedness-and-response/mcm-legal-regulatory-and-policy-framework/emergency-use-authorization#covidinvitrodev, accessed 8 April 2020; https://www.cdc.gov/coronavirus/2019-nCoV/summary.html). On 30 January 2020, the United States reported the first confirmed instance of person-to-person spread of SARS-CoV-2 to an individual who had had close contact with a known case (https://www.cdc.gov/coronavirus/2019-nCoV/summary.html). On 11 March 2020, the WHO declared the 2019 CoV disease (COVID-19) a pandemic (https://www.who.int/dg/speeches/detail/who-director-general-s-opening-remarks-at-the-media-briefing-on-covid-19---11-march-2020, accessed 28 April 2020).


section
INTRODUCTION
Coronaviruses are a

In [12]:
# list of dicts. Each dict has 'title/paper_id', filename and 'body' of the article's intro/background/conclusion/discussion.
pmc_entire_sample =[]
j = 1
usedj = [0]

# list of extra stopwords
other_sw = ['et', 'al', 'et.', 'al.', 'll','non','a','b','c','d','e','f','g','h','i','j','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','>','<',')','(','[','{',']','}','e.g.','=','≥','+', 'α', 'β', 'γ','',' ']

for i in tqdm(range(len(text_corpus))):
    dict1 = {}
    z = ''
    y = ''
    # extract each article----------------------------------------------------------------
    for x in range(len(text_corpus[i]['content']['body_text'])):
        for k,v in text_corpus[i]['content']['body_text'][x].items():

            try:
                # extract
                if (k == 'section' and ('introduction' in v.lower().strip().split())):
                    # print(v.strip()+' in', 'Article number: '+str(i), 'element number(x): '+str(x), sep='|')
                    # print(text_corpus[i]['content']['body_text'][x+1]['text'].strip())
                    z+= ' '+text_corpus[i]['content']['body_text'][x]['text'].strip()
                
                elif (k == 'section' and ('background' in v.lower().strip().split())):
                    # print(v.strip()+' in', 'Article number: '+str(i), 'element number(x): '+str(x), sep='|')
                    # print(text_corpus[i]['content']['body_text'][x+1]['text'].strip())
                    z+= ' '+text_corpus[i]['content']['body_text'][x]['text'].strip()

                elif (k == 'section' and ('conclusion' in v.lower().strip().split())):
                    # print(v.strip()+' in', 'Article number: '+str(i), 'element number(x): '+str(x), sep='|')
                    # print(text_corpus[i]['content']['body_text'][x+1]['text'].strip())
                    y+= ' '+text_corpus[i]['content']['body_text'][x]['text'].strip()
                
                elif (k == 'section' and ('discussion' in v.lower().strip().split())):
                    # print(v.strip()+' in ', 'Article number: '+str(i), 'element number: '+str(x), sep='|')
                    # print(text_corpus[i]['content']['body_text'][x+1]['text'].strip())
                    y+= ' '+text_corpus[i]['content']['body_text'][x]['text'].strip()

            except IndexError:
                print('IndexError out of range')


    # run through spacy's nlp------------------------------------------------------------------
    condtion = True
    if (('the' in z.split(' ')) or ('and' in z.split(' '))) and (len(y)!=0) and (len(z)!=0): # check at least two of the sections were captured
        z+=' '+y
        # print(z)
        docs = nlp(z) # use spacy NLP to tokenize text
        filtered_sent=[]
        for word in docs:
            # filter out stop words, punctiuation, currency, urls and numbers not attached to other characters.
            if word.is_stop==False and word.is_punct==False and (word.is_currency ==False) and (str(word) not in other_sw) and (word.like_url ==False) and (word.like_num ==False) and (len(str(word))!=0):
                # lemmatize, lower case and strip each word.
                lem = word.lemma_.lower().strip()
                # append each word in a list
                filtered_sent.append(lem)
        # attach to dict the text, title of the paper and filename (has unique pmc code)
        dict1['body'] = filtered_sent
        dict1['title'] = [ text_corpus[i]['content']['metadata']['title'], text_corpus[i]['content']['paper_id'] ]
        dict1['filename'] = text_corpus[i]['filename']
    else:
        condtion = False
    
    if condtion == True:
        pmc_entire_sample.append(dict1)
        j+=1
        # print(j)
    # Back up saves
    if j in list(range(5000,55000,5000)):
        j_intial = usedj[-1]
        with open('saved_output/processed_corpus/'+str(j)+'.pkl', 'wb') as f:
          pickle.dump(pmc_entire_sample[j_intial:j], f)
          f.close()
        usedj.append(j)

100%|██████████| 40000/40000 [1:30:07<00:00,  7.40it/s]


In [13]:
# check resulting corpus
print(pmc_entire_sample[2])
print(len(pmc_entire_sample))
print(pmc_entire_sample[2]['body'], pmc_entire_sample[2]['filename'], pmc_entire_sample[2]['title'], sep='\n\n')

{'body': ['novel', 'coronavirus', 'describe', 'wuhan', 'china', 'december', 'lead', 'coronavirus', 'disease', 'covid-19', 'pandemic', 'global', 'economic', 'shutdown', 'amid', 'unprecedented', 'social', 'distancing', 'measure', 'clinical', 'spectrum', 'covid-19', 'range', 'asymptomatic', 'infection', 'mild', 'upper', 'respiratory', 'tract', 'illness', 'majority', 'patient', 'severe', 'viral', 'pneumonia', 'respiratory', 'failure', 'multiorgan', 'failure', 'death', '2–4', 'initial', 'indication', 'old', 'adult', 'people', 'underlying', 'health', 'condition', 'great', 'risk', 'severe', 'illness', '5–7', 'host', 'immune', 'system', 'response', 'important', 'determinant', 'disease', 'progression', 'outcome', 'virus', 'cause', 'covid-19', 'belong', 'sarbecovirus', 'subgenus', 'genus', 'betacoronavirus', 'severe', 'acute', 'respiratory', 'syndrome', 'relate', 'coronavirus', 'sars', 'cov', 'designate', 'sars', 'cov-2', '8)', 'coronaviruse', 'contain', 'structural', 'protein', 'include', 'spik

In [14]:
print(pmc_entire_sample[1000]['body'], pmc_entire_sample[1000]['filename'], pmc_entire_sample[1000]['title'], sep='\n\n')

['novel', 'coronavirus', 'covid-19', 'outbreak', 'start', 'china', 'december', 'case', 'covid-19', 'united', 'states', 'confirm', 'january', 'subsequently', 'outbreak', 'worldwide', 'pandemic', 'global', 'public', 'health', 'emergency', 'general', 'child', 'report', 'severe', 'presentation', 'well', 'prognosis', 'compare', 'adult', 'covid-19', 'infection', 'adult', 'acute', 'kidney', 'injury', 'need', 'dialysis', 'commonly', 'see', 'critical', 'patient', 'covid-19', 'infection', 'novel', 'coronavirus', 'act', 'angiotensin', 'convert', 'enzyme', 'ace2', 'body', 'cell', 'animal', 'study', 'show', 'high', 'expression', 'ace2', 'tubular', 'epithelial', 'cell', 'podocyte', 'kidney', 'report', 'summarize', 'renal', 'pathological', 'finding', 'postmortem', 'kidney', 'specimen', 'china', 'show', 'presence', 'covid-19', 'virus', 'particle', 'tubular', 'epithelial', 'cell', 'podocyte', 'detect', 'electron', 'microscopy', 'involvement', 'podocyte', 'glomerular', 'filtration', 'barrier', 'virus', 

The second model corpus will constitute of the abstract as provided in the metadata.csv file which lists all of the documents in the data and their abstract in a csv table.

In [19]:
with open('saved_output/processed_corpus/pmc_entire_sample.pkl', 'wb') as f:
    pickle.dump(pmc_entire_sample, f)