In [1]:
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm
tqdm.monitor_interval = 0
from Bio import Entrez, Medline
Entrez.email = 'Ricky.Li@apothecom.com'

In [2]:
query = '(COVID-19 OR SARS-CoV-2 OR Corona virus OR Coronavirus) AND ("2020/03/15"[PDat]:"2020/03/24"[PDat]) AND English[lang]))'

In [3]:
handle = Entrez.esearch(db='pubmed', retmax=99999, term=query, rettype='xml')
record = Entrez.read(handle)
id_list = list(record['IdList'])
master_id_list = id_list
print(query + ' - ' + str(len(id_list)) + ' result(s).')

# Fetch full search result from PubMed
def FetchFullResults(idlist):
    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    records = Medline.parse(handle)
    records_list = list(records)
    return records_list

full_records = FetchFullResults(master_id_list)
df_full_records = pd.DataFrame(full_records)
df_full_records.columns

(COVID-19 OR SARS-CoV-2 OR Corona virus OR Coronavirus) AND ("2020/03/15"[PDat]:"2020/03/24"[PDat]) AND English[lang])) - 648 result(s).


Index(['PMID', 'OWN', 'STAT', 'LR', 'IS', 'DP', 'TI', 'LID', 'AB', 'CI', 'FAU',
       'AU', 'AD', 'LA', 'PT', 'DEP', 'PL', 'TA', 'JT', 'JID', 'PMC', 'OTO',
       'OT', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST', 'SO', 'VI', 'IP',
       'PG', 'COIS', 'SB', 'DCOM', 'TT', 'RN', 'MH', 'AUID', 'GR', 'CN', 'EIN',
       'CON', 'CIN', 'UOF', 'EFR', 'IR', 'FIR', 'PMCR'],
      dtype='object')

In [4]:
data = pd.DataFrame(columns=['PMID', 'Title', 'All Authors', 'Abstract', 'Journal', 'Publication Type', 'Publication Year', 'Full Citation', 'PubMed Link'])
data['PMID'] = master_id_list

# go through the master id list and collect information
for i in tqdm(range(len(data['PMID']))):
    
    if df_full_records['FAU'][i] is not np.nan:
        author_list = df_full_records['FAU'][i]
        # return all authors in a string separated by ","
        authors = '; '.join(author_list)
    else:
        first_author, authors, authors_ama = None, None, None
        
    #  Title and Abstract body
    abstract = df_full_records['AB'][i]
    if abstract is np.nan:
        abstract = ''
    title = df_full_records['TI'][i]
    if title is np.nan:
        title = ''
         
    
    journal = df_full_records['TA'][i]
    
    # Full citation 
    try:
        cit_chunks = df_full_records['SO'][i].split('.')
        if df_full_records['PST'][i] == 'aheadofprint':
            cit_no_journal = cit_chunks[1] + '. [Epub ahead of print]' 
            full_cit = cit_chunks[0] + '.' +cit_chunks[1] + '. [Epub ahead of print]'
        else:
            cit_no_journal = cit_chunks[1] + '.'
            full_cit = cit_chunks[0] + '.' + cit_chunks[1] + '.'
    except AttributeError:
        cit_no_journal = journal
        full_cit = journal
        
    # Publication type    
    try:   
        publication_type = '\n'.join(df_full_records['PT'][i])
    except TypeError:
        publication_type = ''
        
    # Publication Year
    publication_year = df_full_records['DP'][i].split(' ')[0]
    
    # PubMed Link
    pubmed_link = 'https://www.ncbi.nlm.nih.gov/pubmed/' + master_id_list[i] + '?'
    # assign value to the dataframe
    data.at[i, ['Title', 'All Authors', 'Abstract', 'Journal', 'Publication Type', 'Publication Year', 
                'Full Citation', 'PubMed Link']] = [title, authors, abstract, journal, publication_type, publication_year, full_cit, pubmed_link]

100%|██████████| 648/648 [00:00<00:00, 833.78it/s]


In [5]:
literature_grid = data

In [6]:
literature_grid.head()

Unnamed: 0,PMID,Title,All Authors,Abstract,Journal,Publication Type,Publication Year,Full Citation,PubMed Link
0,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",An outbreak of severe acute respiratory syndro...,Engineering (Beijing),Journal Article,2020,Engineering (Beijing). 2020 Mar 18. [Epub ahea...,https://www.ncbi.nlm.nih.gov/pubmed/32346491?
1,32341630,Tackling Corona Virus Disease 2019 (COVID 19) ...,"Ramesh, Naveen; Siddaiah, Archana; Joseph, Bobby",Coronaviruses are zoonotic viruses and six spe...,Indian J Occup Environ Med,Journal Article\nReview,2020,Indian J Occup Environ Med. 2020 Jan-Apr;24(1)...,https://www.ncbi.nlm.nih.gov/pubmed/32341630?
2,32339137,Mental health and a novel coronavirus (2019-nC...,"Zheng, Wei",,J Affect Disord,Letter,2020,J Affect Disord. 2020 May 15;269:201-202.,https://www.ncbi.nlm.nih.gov/pubmed/32339137?
3,32337113,Brief Review on COVID-19: The 2020 Pandemic Ca...,"Valencia, Damian N",Severe acute respiratory syndrome coronavirus ...,Cureus,Journal Article\nReview,2020,Cureus. 2020 Mar 24;12(3):e7386.,https://www.ncbi.nlm.nih.gov/pubmed/32337113?
4,32336558,"CRT 2020, COVID-19 and beyond.","Waksman, Ron",,Cardiovasc Revasc Med,Journal Article,2020,Cardiovasc Revasc Med. 2020 Mar 19. [Epub ahea...,https://www.ncbi.nlm.nih.gov/pubmed/32336558?


In [7]:
data = pd.DataFrame(columns = ['PMID', 'Title', 'All Authors', 'Sentence', 'Keywords'])

In [8]:
def clean_up(text):
    clean_up_list = ['RESEARCH',  'AND', 'PURPOSE: ', 'METHODS: ', 'METHOD:', 'RESULTS: ', 'CONCLUSIONS: ',
                     'INTERPRETATION: ', 'OBJECTIVES:', 'RATIONALE: ', 'RECENT FINDINGS: '
                     'OBJECTIVE', 'RESULTS', 'INTRODUCTION', 'RECENT FINDING', 'SUMMARY: '
                     'METHODS', 'CONCLUSIONS', '\n', 'BACKGROUND','Purpose:', 'PURPOSE: ','PURPOSE',
                     'Methods:', 'Conclusion:', 'Background:','Results:', 'DESIGN']
    for stopwords in clean_up_list:
        text = str(text).replace(stopwords, '')
    try:
        index = text.index('.')
        if index<len(text)-1:
            if text[index+1].isupper():
                text = text[:index] + '. ' + text[index+1:]
        else:
            pass
    except ValueError:
        pass
    return text

def sent_tokenize(text):
    sentence_list = text.split('. ')
    sentence_list_temp = []
    for sentence in sentence_list:
        if len(sentence)>1:
            if sentence[-1] == '.':
                sentence_list_temp.append(sentence)
            else:
                sentence_list_temp.append(sentence+'.')
    return sentence_list_temp

def first_term_all_cap(sentence):
    all_cap = True
    for char in sentence.split(' ')[0]:
        if not char.isupper():
            all_cap = False
            break
    return all_cap

def first_char_cap(sentence):
    first_char_cap = True
    for term in sentence.split(' '):
        if not term[0].isupper():
            first_char_cap = False
            break
    return first_char_cap

def all_cap(sentence):
    all_cap = True
    for term in sentence.split(' '):
        for char in term:
            if not char.isupper():
                all_cap = False
                break
    return all_cap

def clean_up_sentence(sentence):
    while sentence[0] == ' ':
        sentence = sentence[1:]
    sentence = sentence.replace('  ', '')
    term_list = sentence.split(' ')
    if not all_cap(sentence):
        if first_char_cap(sentence):
            term_list_after = []
            for term in term_list:
                if all_cap(term):
                    term_list_after.append(term)
                else:
                    term_list_after.append(term[0].lower()+term[1:])    
            sentence = ' '.join(term_list_after) 
        else:
            term_list_after = []
            for term in term_list:
                if all_cap(term.replace(':','')):
                    term_list_after.append(term)
                else:
                    term_list_after.append(term[0].lower()+term[1:])    
            sentence = ' '.join(term_list_after)  
    return sentence

In [9]:
for i in tqdm(range(len(literature_grid))):
    data_temp = pd.DataFrame(columns = ['PMID', 'Title', 'All Authors', 'Sentence', 'Keywords', 'Key Terms'])
    if str(literature_grid.iloc[i]['Abstract']) == '':
        text = clean_up(str(literature_grid.iloc[i]['Title']))
    else:
        text = clean_up(str(literature_grid.iloc[i]['Title']) + ' ' + str(literature_grid.iloc[i]['Abstract']))
    sent_text = sent_tokenize(text)
    data_temp['Sentence'] = sent_text
    data_temp['PMID'] = literature_grid['PMID'][i]
    data_temp['Title'] = literature_grid['Title'][i]
    data_temp['All Authors'] = literature_grid['All Authors'][i]
    data = data.append(data_temp,sort=False)

100%|██████████| 648/648 [00:04<00:00, 151.27it/s]


In [10]:
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1\
    import Features, SentimentOptions, KeywordsOptions

In [11]:
# nYmrbPnNeBRnBullFI6KBqqJsrY9jCNvv1Dv0QBuuSQY #apothecom.NLP
# 2H5wZ6Ghn2VBsAUUcfKOsJVIQz1zX3biAj8S9ozMlLZX
authenticator = IAMAuthenticator('nYmrbPnNeBRnBullFI6KBqqJsrY9jCNvv1Dv0QBuuSQY')
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator
)

natural_language_understanding.set_service_url('https://gateway.watsonplatform.net/natural-language-understanding/api')

In [12]:
data = data.reset_index(drop=True)
print('There are', len(data), 'sentences')
data.head(10)

There are 2759 sentences


Unnamed: 0,PMID,Title,All Authors,Sentence,Keywords,Key Terms
0,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",Experimental Treatment with Favipiravir for CO...,,
1,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",An outbreak of severe acute respiratory syndro...,,
2,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",More than 16% of patients developed acute resp...,,
3,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",No specific treatment has been reported.,,
4,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...","Herein, we examine the effects of Favipiravir ...",,
5,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",Patients with laboratory-confirmed COVID-19 wh...,,
6,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...","Changes in chest computed tomography (CT), vir...",,
7,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",For the 35 patients enrolled in the FPV arm an...,,
8,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",A shorter viral clearance time was found for t...,,
9,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",The FPV arm also showed significant improvemen...,,


In [13]:
for i in tqdm(range(len(data))):
    text = clean_up_sentence(data.iloc[i]['Sentence'])
    try:
        if len(text) > 20:
            response = natural_language_understanding.analyze(
                      text=text,
                      features=Features(keywords=KeywordsOptions())).get_result()
            keywords = '||'.join([keyword['text'] for keyword in response['keywords']])
            term_list = []
            for kt in [keyword['text'] for keyword in response['keywords']]:
                kt_list = kt.split(' ')
                term_list = term_list + kt_list
            terms = '||'.join(term_list)
            data.at[i,['Keywords', 'Key Terms']] = keywords, terms
    except:
        pass

 20%|██        | 557/2759 [07:25<31:28,  1.17it/s]  ERROR:root:Error in service call
Traceback (most recent call last):
  File "C:\Users\ehu\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 384, in _make_request
    six.raise_from(e, None)
  File "<string>", line 2, in raise_from
  File "C:\Users\ehu\AppData\Local\Continuum\anaconda3\lib\site-packages\urllib3\connectionpool.py", line 380, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Users\ehu\AppData\Local\Continuum\anaconda3\lib\http\client.py", line 1336, in getresponse
    response.begin()
  File "C:\Users\ehu\AppData\Local\Continuum\anaconda3\lib\http\client.py", line 306, in begin
    version, status, reason = self._read_status()
  File "C:\Users\ehu\AppData\Local\Continuum\anaconda3\lib\http\client.py", line 267, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Users\ehu\AppData\Local\Continuum\anaconda3\lib\socket.py", line 589, i

In [14]:
data.head()

Unnamed: 0,PMID,Title,All Authors,Sentence,Keywords,Key Terms
0,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",Experimental Treatment with Favipiravir for CO...,experimental treatment||open-Label control stu...,experimental||treatment||open-Label||control||...
1,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",An outbreak of severe acute respiratory syndro...,SARS-CoV-2||coronavirus disease||infection||ou...,SARS-CoV-2||coronavirus||disease||infection||o...
2,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",More than 16% of patients developed acute resp...,acute respiratory distress syndrome||patients|...,acute||respiratory||distress||syndrome||patien...
3,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...",No specific treatment has been reported.,specific treatment,specific||treatment
4,32346491,Experimental Treatment with Favipiravir for CO...,"Cai, Qingxian; Yang, Minghui; Liu, Dongjing; C...","Herein, we examine the effects of Favipiravir ...",effects of favipiravir||FPV||treatment of cOVI...,effects||of||favipiravir||FPV||treatment||of||...


In [15]:
key_term_frequency = {}
for i in range(len(data)):
    for key_term in str(data['Key Terms'][i]).split('||'):
        if key_term in key_term_frequency:
            key_term_frequency[key_term] += 1
        else:
            key_term_frequency[key_term] = 1
            
key_term_frequency_df = pd.DataFrame(key_term_frequency, index=[1]).T

In [16]:
keyword_frequency = {}
for i in range(len(data)):
    for keyword in str(data['Keywords'][i]).split('||'):
        if keyword in keyword_frequency:
            keyword_frequency[keyword] += 1
        else:
            keyword_frequency[keyword] = 1
            
keyword_frequency_df = pd.DataFrame(keyword_frequency, index=[1]).T

In [17]:
writer = pd.ExcelWriter(r'C:\Users\ehu\repos\dna\NLP Projects\COVID NLP.xlsx', engine='xlsxwriter')
data.to_excel(writer, sheet_name='Grid', index=False)
keyword_frequency_df.to_excel(writer, sheet_name='Keywords')
key_term_frequency_df.to_excel(writer, sheet_name='Key terms')

In [18]:
writer.save()# Finished - save
writer.close()# Finished - close