In [1]:
from collections import defaultdict
import json
import pandas as pd
from pathlib import Path
import re
import vaex

In [65]:
#Data Paths
root_path = Path.cwd().parents[0] 
data_dir = root_path / "2020-03-13"
processed_dir = root_path / "processed"
processed_dir.mkdir(exist_ok=True)

Load the metadata

In [3]:
meta_df = pd.read_csv(data_dir / "all_sources_metadata_2020-03-13.csv")

In [4]:
meta_df.head()

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,CZI,Angiotensin-converting enzyme 2 (ACE2) as a SA...,10.1007/s00134-020-05985-9,,32125455.0,cc-by-nc,,2020,"Zhang, Haibo; Penninger, Josef M.; Li, Yimin; ...",Intensive Care Med,2002765000.0,#3252,True
1,53eccda7977a31e3d0f565c884da036b1e85438e,CZI,Comparative genetic analysis of the novel coro...,10.1038/s41421-020-0147-1,,,cc-by,,2020,"Cao, Yanan; Li, Lin; Feng, Zhimin; Wan, Shengq...",Cell Discovery,3003431000.0,#1861,True
2,210a892deb1c61577f6fba58505fd65356ce6636,CZI,Incubation Period and Other Epidemiological Ch...,10.3390/jcm9020538,,,cc-by,The geographic spread of 2019 novel coronaviru...,2020,"Linton, M. Natalie; Kobayashi, Tetsuro; Yang, ...",Journal of Clinical Medicine,3006065000.0,#1043,True
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,CZI,Characteristics of and Public Health Responses...,10.3390/jcm9020575,,32093211.0,cc-by,"In December 2019, cases of unidentified pneumo...",2020,"Deng, Sheng-Qun; Peng, Hong-Juan",J Clin Med,177663100.0,#1999,True
4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,"Zhang, Wei",Intensive Care Med,3006643000.0,#3242,False


In [5]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29500 entries, 0 to 29499
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   sha                          17420 non-null  object 
 1   source_x                     29500 non-null  object 
 2   title                        29130 non-null  object 
 3   doi                          26357 non-null  object 
 4   pmcid                        27337 non-null  object 
 5   pubmed_id                    16730 non-null  float64
 6   license                      17692 non-null  object 
 7   abstract                     26553 non-null  object 
 8   publish_time                 18248 non-null  object 
 9   authors                      28554 non-null  object 
 10  journal                      17791 non-null  object 
 11  Microsoft Academic Paper ID  1134 non-null   float64
 12  WHO #Covidence               1236 non-null   object 
 13  has_full_text   

Not all papers have a title or abstract. Do all of them have at least one or the other?

In [6]:
meta_df[meta_df['title'].notnull() | meta_df['abstract'].notnull()].shape

(29130, 14)

No, looks like the limiting factor is the title

In [7]:
meta_df[meta_df['title'].isnull()]

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
18434,,PMC,,,PMC6824915,,,,,,,,,
23727,,PMC,,,PMC2762764,20025201.0,CC BY,,2009 Sep 9,,PLoS Curr,,,
23765,,PMC,,,PMC2762335,20020673.0,CC BY,,2009 Aug 21,,PLoS Curr,,,
25385,,PMC,,,PMC2080411,,,,,,,,,
25598,,PMC,,,PMC1246105,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29495,c42a617a00afe6a36bde0a8e3638e0f55bfee4f7,medrxiv,,doi.org/10.1101/2020.03.08.20032847,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29496,c4ce14ce42fa4360dfe3515ec9d1584847381c27,medrxiv,,doi.org/10.1101/2020.03.08.20032854,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True
29497,c41e09a32be90c84cea0616bb1c726aecba721e0,medrxiv,,doi.org/10.1101/2020.03.09.20032219,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,False
29498,ca88735399ff43d0e673876200655099f06f5567,medrxiv,,doi.org/10.1101/2020.03.09.20033183,,,See https://www.medrxiv.org/submit-a-manuscript,,,,,,,True


Some of the papers without title seem to have full text. In our processing, we will filter out papers that have no textual fields, eg title, abstract, or full text

In [8]:
meta_df[meta_df['title'].isnull() & meta_df['abstract'].isnull() & meta_df['has_full_text'].isnull()]

Unnamed: 0,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
18434,,PMC,,,PMC6824915,,,,,,,,,
23727,,PMC,,,PMC2762764,20025201.0,CC BY,,2009 Sep 9,,PLoS Curr,,,
23765,,PMC,,,PMC2762335,20020673.0,CC BY,,2009 Aug 21,,PLoS Curr,,,
25385,,PMC,,,PMC2080411,,,,,,,,,
25598,,PMC,,,PMC1246105,,,,,,,,,
25648,,PMC,,,PMC1188130,,,,,,,,,
26186,,PMC,,,PMC404520,,,,,,,,,
26340,,PMC,,,PMC286345,,,,,,,,,
26629,,PMC,,,PMC1125910,,,,,,,,,


In [9]:
meta_df.groupby('source_x').count()

Unnamed: 0_level_0,sha,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text
source_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
CZI,167,1236,1177,0,404,172,848,1164,1191,1198,1134,1236,167
PMC,16326,27328,24253,27337,16326,16593,25139,16518,26797,16593,0,0,16326
biorxiv,566,566,566,0,0,566,566,566,566,0,0,0,566
medrxiv,361,0,361,0,0,361,0,0,0,0,0,0,361


In [10]:
all_jsons = list(data_dir.glob('**/*.json'))

In [11]:
len(all_jsons)

13202

There is metadata for 29500 entries, but 13202 json files.

In [12]:
all_jsons[0]

PosixPath('/Users/akdidier/Documents/CORD/2020-03-13/pmc_custom_license/pmc_custom_license/8f8eb4f004c2002face0723f2f58cc411954d36e.json')

In [13]:
data_sets = ["biorxiv_medrxiv", "comm_use_subset", "noncomm_use_subset", "pmc_custom_license"]
jsons = []
for d in data_sets:
    jsons.extend((data_dir / d).glob("**/*.json"))

In [14]:
jsons[0]

PosixPath('/Users/akdidier/Documents/CORD/2020-03-13/biorxiv_medrxiv/biorxiv_medrxiv/f905f78b32f63c6d14a79984dfb33f1b358b8ab4.json')

In [15]:
len(jsons)

13202

In [16]:
with open(jsons[0], "r") as f:
    data = json.load(f)

In [17]:
data

{'paper_id': 'f905f78b32f63c6d14a79984dfb33f1b358b8ab4',
 'metadata': {'title': 'Multimerization of HIV-1 integrase hinges on conserved SH3-docking platforms',
  'authors': [{'first': 'Meytal',
    'middle': [],
    'last': 'Galilee',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'Technion -Israel Institute of Technology',
     'location': {'postCode': '320003, 320003',
      'settlement': 'Haifa, Haifa',
      'country': 'Israel., Israel'}},
    'email': ''},
   {'first': 'Akram',
    'middle': [],
    'last': 'Alian',
    'suffix': '',
    'affiliation': {'laboratory': '',
     'institution': 'Technion -Israel Institute of Technology',
     'location': {'postCode': '320003, 320003',
      'settlement': 'Haifa, Haifa',
      'country': 'Israel., Israel'}},
    'email': 'alian@technion.ac.il'}]},
 'abstract': [{'text': 'New anti-AIDS treatments must be continually developed in order to overcome resistance mutations including those emerging in the newest th

In [18]:
data.keys()

dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])

In [19]:
obj = data['body_text']

In [20]:
len(obj[0]['text'])

1282

In [21]:
obj[0]['text']

"In the absence of a curative treatment, the highly active antiretroviral therapy (HAART) keeps the HIV-1 virus of AIDS patients under control. HAART combines drugs targeting different stages of viral replication including the integration step catalyzed by the integrase protein (IN) (1) . Integration of viral DNA into host genome involves two steps catalyzed by IN: (i) cleavage of a dinucleotide from each 3'-end of the viral DNA (3'processing), and (ii) insertion of this processed viral DNA into the host DNA (strand-transfer) (2) . Clinical IN strand transfer inhibitors (INSTIs) target the catalytic site of the enzyme to specifically inhibit the DNA joining reaction, however, as with all anti-AIDS treatments, the continued success of these drugs is persistently disrupted by resistance mutations (1, 2) . Although 3'-processing can be carried out by monomeric IN (3) , the assembly of IN functional multimers is imperative for the strand-transfer activity (4) (5) (6) (7) (8) , and for viru

In [22]:
obj[1]['text'][436:439]

'(4)'

In [92]:
obj[0]['cite_spans']

[{'start': 283, 'end': 286, 'text': '(1)', 'ref_id': 'BIBREF0'},
 {'start': 531, 'end': 534, 'text': '(2)', 'ref_id': 'BIBREF1'},
 {'start': 805, 'end': 808, 'text': '(1,', 'ref_id': 'BIBREF0'},
 {'start': 809, 'end': 811, 'text': '2)', 'ref_id': 'BIBREF1'},
 {'start': 869, 'end': 875, 'text': 'IN (3)', 'ref_id': 'BIBREF2'},
 {'start': 965, 'end': 968, 'text': '(4)', 'ref_id': 'BIBREF3'},
 {'start': 969, 'end': 972, 'text': '(5)', 'ref_id': 'BIBREF4'},
 {'start': 973, 'end': 976, 'text': '(6)', 'ref_id': 'BIBREF5'},
 {'start': 977, 'end': 980, 'text': '(7)', 'ref_id': 'BIBREF6'},
 {'start': 981, 'end': 984, 'text': '(8)', 'ref_id': 'BIBREF7'},
 {'start': 1049, 'end': 1052, 'text': '(9,', 'ref_id': 'BIBREF8'},
 {'start': 1053, 'end': 1056, 'text': '10)', 'ref_id': 'BIBREF9'},
 {'start': 1261, 'end': 1265, 'text': '(11)', 'ref_id': 'BIBREF10'},
 {'start': 1266, 'end': 1270, 'text': '(12)', 'ref_id': 'BIBREF11'},
 {'start': 1271, 'end': 1275, 'text': '(13)', 'ref_id': 'BIBREF12'},
 {'star

In [23]:
data['body_text'][2]

{'text': 'Hindering the assembly of IN functional multimers is only one side of the coin. Allosteric interference has also been shown to promote the formation of aberrant IN multimers and aggregates. The potential of allosteric IN inhibitors has been demonstrated through the thorough characterization of the "LEDGF pocket" formed at the dimer interface of IN and the development of LEDGIN (or ALLINI) inhibitors that bind to it (11) (Figure 1A) . Although less investigated, other IN pockets capable of allosteric inhibitor binding have also been identified ( Figure 1A) : binding of the "Y3" molecule to a pocket near the N-terminal end of CCD α-helix 4, designated Y3pocket, has been shown to inhibit 3\'-processing and strand-transfer activities (16) ; the "sucrose" binding pocket found along the CCD dimer interface and flanked by two LEDGF pockets (17, 18) has recently been targeted by the natural product kuwanon-L, which inhibited IN activity in a pattern similar to LEDGINs (19) . Another 

In [32]:
obj = data['body_text'][2]
span_keys = ["cite_spans", "ref_spans", "eq_spans"]
all_spans = []
for key in span_keys:
    if key in obj.keys():
        all_spans.extend(obj[key])
#sort all_spans by start value
all_spans = sorted(all_spans, key=lambda obj: obj['start'])
print(all_spans)
text = obj['text']
print(f"len text: {len(text)}")
keep_spans = []
for i in range(len(all_spans)):
    span = all_spans[i]
    if i == 0:
        keep_spans.append((0, span['start']))
    else:
        prev_span = all_spans[i-1]
        if prev_span['end'] + 1 == span['start']:
            continue
        else:
            keep_spans.append((prev_span['end'], span['start']))

#add the end of the article
keep_spans.append((all_spans[-1]['end'], len(text)))
print(keep_spans)
for span in keep_spans:
    print(span)
    print(text[span[0]:span[1]])
#remove all the citations
# new_text = "".join([text[span[0]:span[1]] for span in keep_spans])
# #remove extra whitespace
# new_text = re.sub("\s+", " ", new_text)
# print(new_text)

[{'start': 419, 'end': 423, 'text': '(11)', 'ref_id': 'BIBREF10'}, {'start': 424, 'end': 435, 'text': '(Figure 1A)', 'ref_id': None}, {'start': 551, 'end': 561, 'text': 'Figure 1A)', 'ref_id': None}, {'start': 739, 'end': 743, 'text': '(16)', 'ref_id': 'BIBREF15'}, {'start': 844, 'end': 848, 'text': '(17,', 'ref_id': 'BIBREF16'}, {'start': 849, 'end': 852, 'text': '18)', 'ref_id': 'BIBREF17'}, {'start': 974, 'end': 978, 'text': '(19)', 'ref_id': 'BIBREF18'}, {'start': 1071, 'end': 1075, 'text': '(20)', 'ref_id': 'BIBREF19'}, {'start': 1297, 'end': 1300, 'text': '(3)', 'ref_id': 'BIBREF2'}, {'start': 1454, 'end': 1458, 'text': '(21)', 'ref_id': 'BIBREF20'}, {'start': 1586, 'end': 1590, 'text': '(22)', 'ref_id': 'BIBREF21'}]
len text: 1719
[(0, 419), (435, 551), (561, 739), (743, 844), (852, 974), (978, 1071), (1075, 1297), (1300, 1454), (1458, 1586), (1590, 1719)]
(0, 419)
Hindering the assembly of IN functional multimers is only one side of the coin. Allosteric interference has also be

In [30]:
text[844:852]

'(17, 18)'

In [50]:
class JsonReader():
    def __init__(self, file_path, remove_citations=True):
        with open(file_path, "r") as f:
            data = json.load(f)
        self.extracted = defaultdict(str)
        
        text_keys = ['abstract', 'body_text'] #These require processing to clean citations
        for key in text_keys:
            if key in data.keys():
                text_list = data[key]
                cleaned_text = [obj['text'] for obj in text_list]    
                if remove_citations:
                    text_list = data[key]
                    cleaned_text = list(map(self._remove_citations, text_list))
                    self.extracted[key] += "\n ".join(cleaned_text)

        self.extracted["paper_id"] = data["paper_id"]
        if "title" in data.keys():
            self.extracted["title"] = data["title"]  

        
    def _remove_citations(self, obj: dict):
        """
        Text cleaning function to remove the citation spans from the text. 
        Args:
            obj: dict, A dictionary of format
                {
                "text": <str>,
                "cite_spans": [             # list of character indices of inline citations
                                            # e.g. citation "[7]" occurs at positions 151-154 in "text"
                                            #      linked to bibliography entry BIBREF3
                    {
                        "start": 151,
                        "end": 154,
                        "text": "[7]",
                        "ref_id": "BIBREF3"
                    },
                    ...
                ],
                "ref_spans": <list of dicts similar to cite_spans>,     # e.g. inline reference to "Table 1"
                "section": "Abstract"
            }
        """
        #Get a flat list of all the spans to remove
        span_keys = ["cite_spans", "ref_spans", "eq_spans"]
        all_spans = []
        for key in span_keys:
            if key in obj.keys():
                all_spans.extend(obj[key])
        
        #sort all_spans by start value
        all_spans = sorted(all_spans, key=lambda obj: obj['start'])
        text = obj['text']
        
        if all_spans:
            #Compute the complement: the list of indices that spans the text to be kept
            keep_spans = []
            for i in range(len(all_spans)):
                span = all_spans[i]
                if i == 0:
                    keep_spans.append((0, span['start']))
                else:
                    prev_span = all_spans[i-1]
                    if prev_span['end'] + 1 == span['start']:
                        continue
                    else:
                        keep_spans.append((prev_span['end'], span['start']))
            #add the end of the article to the spans
            keep_spans.append((all_spans[-1]['end'], len(text)))

            #remove all the citations
            new_text = "".join([text[span[0]:span[1]] for span in keep_spans])
            #remove extra whitespace
            new_text = re.sub("\s+", " ", new_text)
            return new_text
        
        else:
            return text

In [51]:
reader = JsonReader(all_jsons[1])
reader.extracted

defaultdict(str,
            {'abstract': '',
             'body_text': 'I nfectious diseases have been an ever-present threat to mankind. From the biblical plagues and the Plague of Athens in ancient times, to the Black Death of the Middle Ages, the 1918 "Spanish Flu" pandemic, and more recently, the HIV/AIDS pandemic, infectious diseases have continued to emerge and reemerge in a manner that defies accurate predictions .\n The past 10 years have been no exception, as many new and reemerging microbial threats have continued to challenge the public health and infectious disease research communities worldwide. Since 1994, when Emerging Infectious Diseases made its publication debut, significant strides in the global fight against the HIV/AIDS pandemic have been made. The infectious disease community has confronted several other newly emerging pathogens, such as the severe acute respiratory syndrome-associated coronavirus (SARS-CoV), henipaviruses (Hendra and Nipah), and, most recently, 

In [52]:
def get_extraction(json_file):
    reader = JsonReader(json_file)
    return reader.extracted

In [53]:
%time

extractions = list(map(get_extraction, all_jsons))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.87 µs


In [54]:
df = pd.DataFrame(extractions)

In [55]:
df.head()

Unnamed: 0,abstract,body_text,paper_id
0,Bordetella bronchiseptica isolate KM22 has bee...,20-kb insert library preparation protocol (htt...,8f8eb4f004c2002face0723f2f58cc411954d36e
1,,I nfectious diseases have been an ever-present...,63f7049d200896290b38b38711113054f7ea1b50
2,The influenza A nucleoprotein (NP) is an attra...,The transmission of a pathogenic avian H5N1 vi...,4df45b8404d9de0b376a8ae3c282a517df36fe51
3,The outbreak of severe acute respiratory syndr...,acute lung injury; inflammatory response; neut...,e0737ee93afe7b0bf06b1e3f9adf21d541dd10f0
4,The prevalence of feline herpesvirus-1 (FHV-1)...,Feline herpesvirus type 1 (FHV-1) is the most ...,3c3572ba243d61e7631725669c8f88347fdbd5bc


In [56]:
df.columns

Index(['abstract', 'body_text', 'paper_id'], dtype='object')

In [60]:
df = df.rename(columns={'paper_id': 'sha'})

In [63]:
#merge with metadata DOI
df = df.merge(meta_df[["doi", 'sha', 'source_x', 'journal', 'has_full_text']], on='sha')
df.head()

Unnamed: 0,abstract,body_text,sha,doi_x,doi_y,source_x,journal,has_full_text
0,Bordetella bronchiseptica isolate KM22 has bee...,20-kb insert library preparation protocol (htt...,8f8eb4f004c2002face0723f2f58cc411954d36e,http://dx.doi.org/10.1128/MRA.01207-19,http://dx.doi.org/10.1128/MRA.01207-19,PMC,Microbiol Resour Announc,True
1,,I nfectious diseases have been an ever-present...,63f7049d200896290b38b38711113054f7ea1b50,http://dx.doi.org/10.3201/eid1104.041167,http://dx.doi.org/10.3201/eid1104.041167,PMC,Emerg Infect Dis,True
2,The influenza A nucleoprotein (NP) is an attra...,The transmission of a pathogenic avian H5N1 vi...,4df45b8404d9de0b376a8ae3c282a517df36fe51,http://dx.doi.org/10.1051/vetres/2009071,http://dx.doi.org/10.1051/vetres/2009071,PMC,Vet Res,True
3,The outbreak of severe acute respiratory syndr...,acute lung injury; inflammatory response; neut...,e0737ee93afe7b0bf06b1e3f9adf21d541dd10f0,http://dx.doi.org/10.1038/labinvest.2012.92,http://dx.doi.org/10.1038/labinvest.2012.92,PMC,Lab Invest,True
4,The prevalence of feline herpesvirus-1 (FHV-1)...,Feline herpesvirus type 1 (FHV-1) is the most ...,3c3572ba243d61e7631725669c8f88347fdbd5bc,http://dx.doi.org/10.4142/jvs.2008.9.2.207,http://dx.doi.org/10.4142/jvs.2008.9.2.207,PMC,J Vet Sci,True


In [66]:
df.to_parquet(processed_dir / "documents_processed.parquet")