# Goal:

Create a data frame that stores abstract of each research paper

# Data Preprocessing

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import os

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [14]:
current_folder = os.getcwd()

In [15]:
root_path = os.path.join(current_folder, 'data/')

In [19]:
metadata_path = os.path.join(root_path, 'metadata.csv')

In [20]:
metadata_path

'/Users/nhungle/Desktop/repos/Data-Science-Projects/CORD-19-research-challenge/data/metadata.csv'

In [39]:
meta_df = pd.read_csv(metadata_path,
                     dtype={
                         'pubmed_id': str,
                         'Microsoft Academic Paper ID': str, 
                         'doi': str
                     })

In [41]:
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 15 columns):
sha                            28462 non-null object
source_x                       44220 non-null object
title                          43996 non-null object
doi                            40750 non-null object
pmcid                          23319 non-null object
pubmed_id                      22943 non-null object
license                        44220 non-null object
abstract                       35806 non-null object
publish_time                   34197 non-null object
authors                        41074 non-null object
journal                        33173 non-null object
Microsoft Academic Paper ID    964 non-null object
WHO #Covidence                 1767 non-null object
has_full_text                  44220 non-null bool
full_text_file                 32829 non-null object
dtypes: bool(1), object(14)
memory usage: 4.8+ MB


In [42]:
all_json= glob.glob(f'{root_path}/**/*.json', recursive=True)
len(all_json)

29315

In [192]:
class FileReader:
    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
            self.paper_id = content['paper_id']
            self.abstract = []
            self.body_text = []
            # Abstract
            for entry in content['abstract']:
                self.abstract.append(entry['text'])
            # Body text
            for entry in content['body_text']:
                self.body_text.append(entry['text'])
            self.abstract = '\n'.join(self.abstract)
            self.body_text = '\n'.join(self.body_text)
    def __repr__(self):
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'


In [193]:
first_entry = FileReader(all_json[0])
print(first_entry)

ab680d5dbc4f51252da3473109a7885dd6b5eb6f: ... The evolutionary history of humans is characterized by dynamic shifts in population density and the structure of our social contact networks. Agriculture, the advent of City-States, European expansion...


In [197]:
len(first_entry.abstract)

0

In [66]:
def get_breaks(content, length):
    data = ""
    words = content.split(' ')
    total_chars = 0

    # add break every length characters
    for i in range(len(words)):
        total_chars += len(words[i])
        if total_chars > length:
            data = data + "<br>" + words[i]
            total_chars = 0
        else:
            data = data + " " + words[i]
    return data

In [68]:
first_entry.paper_id

'ab680d5dbc4f51252da3473109a7885dd6b5eb6f'

In [102]:
type(meta_df['authors'][1])==str

True

In [98]:
meta_df['authors'].apply(lambda x: type(x)=='float')

0        False
1        False
2        False
3        False
4        False
         ...  
44215    False
44216    False
44217    False
44218    False
44219    False
Name: authors, Length: 44220, dtype: bool

In [178]:
dict_ = {'paper_id': [], 'abstract': [],
         'body_text': [], 'authors': [],
         'title': [], 'journal': [],
         'abstract_summary': []}
for idx, entry in enumerate(all_json):
    if idx % (len(all_json) // 10) == 0:
        print(f'Processing index: {idx} of {len(all_json)}')
    content = FileReader(entry)
    
    # get metadata information
    meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
    
    # no metadata, skip this paper
    if len(meta_data) == 0:
        continue
    
    
    dict_['paper_id'].append(content.paper_id)
    dict_['abstract'].append(content.abstract)
    dict_['body_text'].append(content.body_text)
    
    # get abstract summary

    if len(content.abstract) == 0:
        dict_['abstract_summary'].append("Not provided")
        
    elif len(content.abstract.split(' ')) > 100:
        info = content.abstract.split(' ')[:100]
        # add break every 40 characters
        summary = get_breaks(' '.join(info), 40)
        dict_['abstract_summary'].append(summary + "...")
        
    else:
        # abstract is short enough
        summary = get_breaks(content.abstract, 40)
        dict_['abstract_summary'].append(summary)
        
        

    
    # get authors
    try:
        authors_list = meta_data['authors'].values[0].split(';')
        if len(authors_list) > 2:
            dict_['authors'].append('; '.join(authors_list[:2]) + '...')
            
        else:
            dict_['authors'].append('; '.join(authors_list))
    except Exception as e:
        # if only one author - or Null value
        dict_['authors'].append(meta_data['authors'].values[0])
        
        
        
    # get title
    try:
        title = get_breaks(meta_data['title'].values[0], 40)
        dict_['title'].append(title)
    # if title was not provided
    except Exception as e:
        dict_['title'].append(meta_data['title'].values[0])
    
    # add the journal information
    dict_['journal'].append(meta_data['journal'].values[0])
            
    

Processing index: 0 of 29315
Processing index: 2931 of 29315
Processing index: 5862 of 29315
Processing index: 8793 of 29315
Processing index: 11724 of 29315
Processing index: 14655 of 29315
Processing index: 17586 of 29315
Processing index: 20517 of 29315
Processing index: 23448 of 29315
Processing index: 26379 of 29315
Processing index: 29310 of 29315


In [179]:
dict_.keys()

dict_keys(['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])

In [180]:
for key in dict_.keys():
    print(key, len(dict_[key]))

paper_id 27678
abstract 27678
body_text 27678
authors 27678
title 27678
journal 27678
abstract_summary 27678


In [181]:
covid19_df = pd.DataFrame(dict_,
                         columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])


In [158]:
import pickle
pickle.dump(covid19_df, open(os.path.join(root_path, 'covid19_literatur.pickle'), "wb"))

In [172]:
meta_df.loc[meta_df['sha'] == 'ab680d5dbc4f51252da3473109a7885dd6b5eb6f']['abstract']

2707    Abstract This article discusses how evolutiona...
Name: abstract, dtype: object

In [186]:
covid19_df.shape

(27678, 7)

In [185]:
covid19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27678 entries, 0 to 27677
Data columns (total 7 columns):
paper_id            27678 non-null object
abstract            27678 non-null object
body_text           27678 non-null object
authors             26929 non-null object
title               27634 non-null object
journal             26784 non-null object
abstract_summary    27678 non-null object
dtypes: object(7)
memory usage: 1.5+ MB


In [189]:
covid19_df.iloc[0]['abstract']

''