# Goal:

Create a data frame that stores abstract of each research paper

# Data Preprocessing

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import os

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [14]:
current_folder = os.getcwd()

In [15]:
root_path = os.path.join(current_folder, 'data/')

In [19]:
metadata_path = os.path.join(root_path, 'metadata.csv')

In [20]:
metadata_path

'/Users/nhungle/Desktop/repos/Data-Science-Projects/CORD-19-research-challenge/data/metadata.csv'

In [39]:
meta_df = pd.read_csv(metadata_path,
                     dtype={
                         'pubmed_id': str,
                         'Microsoft Academic Paper ID': str, 
                         'doi': str
                     })

In [41]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 15 columns):
sha                            28462 non-null object
source_x                       44220 non-null object
title                          43996 non-null object
doi                            40750 non-null object
pmcid                          23319 non-null object
pubmed_id                      22943 non-null object
license                        44220 non-null object
abstract                       35806 non-null object
publish_time                   34197 non-null object
authors                        41074 non-null object
journal                        33173 non-null object
Microsoft Academic Paper ID    964 non-null object
WHO #Covidence                 1767 non-null object
has_full_text                  44220 non-null bool
full_text_file                 32829 non-null object
dtypes: bool(1), object(14)
memory usage: 4.8+ MB


In [42]:
all_json= glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

29315

In [67]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_entry = FileReader(all_json[0])
print(first_entry)

ab680d5dbc4f51252da3473109a7885dd6b5eb6f: ... The evolutionary history of humans is characterized by dynamic shifts in population density and the structure of our social contact networks. Agriculture, the advent of City-States, European expansion...


In [66]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [68]:
first_entry.paper_id

'ab680d5dbc4f51252da3473109a7885dd6b5eb6f'

In [102]:
type(meta_df['authors'][1])==str

True

In [98]:
meta_df['authors'].apply(lambda x: type(x)=='float')

0        False
1        False
2        False
3        False
4        False
         ...  
44215    False
44216    False
44217    False
44218    False
44219    False
Name: authors, Length: 44220, dtype: bool

In [150]:
dict_ = {'paper_id': [], 'abstract': [],
         'body_text': [], 'authors': [],
         'title': [], 'journal': [],
         'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # get abstract summary

    if len(content.abstract) == 0:
        dict_['abstract_summary'].append("Not provided")
        
    elif len(content.abstract.split(' ')) > 100:
        info = content.abstract.split(' ')[:100]
        # add break every 40 characters
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
        
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
        

    
    # get authors
    try:
        authors_list = meta_data['authors'].values[0].split(';')
        if len(authors_list) > 2:
            dict_['authors'].append('; '.join(authors_list[:2]) + '...')
            
        else:
            dict_['authors'].append('; '.join(authors_list))
    except Exception as e:
        # if only one author - or Null value
        dict_['authors'].append(meta_data['authors'].values[0])
        
        
        
    # get title
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
            
    

Processing index: 0 of 29315
Processing index: 2931 of 29315
Processing index: 5862 of 29315
Processing index: 8793 of 29315
Processing index: 11724 of 29315
Processing index: 14655 of 29315
Processing index: 17586 of 29315
Processing index: 20517 of 29315
Processing index: 23448 of 29315
Processing index: 26379 of 29315
Processing index: 29310 of 29315


In [151]:
dict_.keys()

dict_keys(['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])

In [152]:
for key in dict_.keys():
    print(key, len(dict_[key]))

paper_id 27678
abstract 27678
body_text 27678
authors 27678
title 27678
journal 27678
abstract_summary 27678


In [153]:
covid19_df = pd.DataFrame(dict_,
                         columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])


In [154]:
covid19_df 

Unnamed: 0,paper_id,abstract,body_text,authors,title,journal,abstract_summary
0,ab680d5dbc4f51252da3473109a7885dd6b5eb6f,,The evolutionary history of humans is characte...,"Scarpino, S.V.",Evolutionary Medicine IV. Evolution and<br>Em...,Encyclopedia of Evolutionary Biology,Not provided
1,6599ebbef3d868afac9daa4f80fa075675cf03bc,"International aviation is growing rapidly, res...","Sixty years ago, civil aviation was an infant ...","Macintosh, Andrew; Wallace, Lailey",International aviation emissions to 2025: Can...,Energy Policy,"International aviation is growing rapidly,<br..."
2,eb5c7f3ff921ad6469b79cc8a3c122648204ece4,,Acute infections of the gastrointestinal tract...,"Booth, I.W.; McNeish, A.S.",Mechanisms of diarrhoea,Baillière's Clinical Gastroenterology,Not provided
3,b87b790c96c75faa22a085cb560f7b3d8e018b24,,"There are three domains of life-Bacteria, Arch...","Louten, Jennifer",Chapter 3 Features of Host Cells Cellular and...,Essential Human Virology,Not provided
4,68c0bb1989b6ca2b38da32a0d992027db39f80bc,Hong Kong's new Police Commissioner Chris Tang...,"It is also noteworthy that Tang, who was once ...","Hui, Victoria Tin-bor",Beijing's Hard and Soft Repression in Hong Kong,Orbis,Hong Kong's new Police Commissioner Chris Tan...
...,...,...,...,...,...,...,...
27673,228650bc0429064d800d4b9c5fb0e00c2533a579,We hypothesized that postnatal development of ...,Early nutritional environment affects long ter...,"Harlow, KaLynn; Ferreira, Christina R....",Lipidome profiles of postnatal day 2 vaginal<...,PLoS One,We hypothesized that postnatal development of...
27674,2246e28681bde69c65dc9081df367bb661997f19,"Venereal syphilis is a multi-stage, sexually t...",Syphilis is a sexually transmitted disease (ST...,"Cruz, Adriana R.; Pillay, Allan...","Secondary Syphilis in Cali, Colombia: New<br>...",PLoS Negl Trop Dis,"Venereal syphilis is a multi-stage, sexually<..."
27675,577c6a13f9ef70e9756890fc66e98f537c01ac0a,The emergence of Middle East respiratory syndr...,Scientific RepoRts | 6:21878 | DOI: 10 .1038/s...,"Munster, Vincent J.; Adney, Danielle R....",Replication and shedding of MERS-CoV in<br>Ja...,Sci Rep,The emergence of Middle East respiratory<br>s...
27676,c5c2bc7a07670d6fb970d84a59aab3832752a3f1,We have previously shown that the infection of...,Arenaviruses are enveloped RNA viruses contain...,"Brunetti, Jesús E.; Foscaldi, Sabrina...",Role of the ERK1/2 Signaling Pathway in the<b...,Viruses,We have previously shown that the infection o...
