In [1]:
from Bio import Entrez
import pandas as pd
import numpy as np
from urllib.request import urlopen, Request
from urllib.parse import urlencode

import json
from pprint import pprint

In [19]:
# helper functions from https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/

def search(query, retmax=750, email='youremail@example.com'):
    """Run a relevance-sorted PubMed search and return the parsed response.

    Parameters
    ----------
    query : str
        Search term passed to PubMed as ``term``.
    retmax : int or str, optional
        Maximum number of papers to return (default 750, the value that was
        previously hard-coded).
    email : str, optional
        Contact address assigned to ``Entrez.email`` before the request.
        The default is a placeholder and should be replaced with a real
        address.

    Returns
    -------
    The object produced by ``Entrez.read`` for the ESearch response; callers
    read the matching PubMed IDs from its ``'IdList'`` entry.
    """
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=str(retmax),  # the number of papers you want to return
                            retmode='xml',
                            term=query)
    # Close the handle even when parsing fails so the connection is not leaked.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


def fetch_details(id_list, email='jacobdodd94@gmail.com'):
    """Fetch the full PubMed records for a list of PubMed IDs.

    Parameters
    ----------
    id_list : iterable
        PubMed IDs. Items are converted with ``str`` before being joined, so
        both strings and integers are accepted.
    email : str, optional
        Contact address assigned to ``Entrez.email`` before the request
        (defaults to the address that was previously hard-coded).

    Returns
    -------
    The object produced by ``Entrez.read`` for the EFetch response; callers
    read the individual papers from its ``'PubmedArticle'`` entry.
    """
    ids = ','.join(str(pubmed_id) for pubmed_id in id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    # Close the handle even when parsing fails so the connection is not leaked.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()

In [30]:


# Search-term batches, keyed by the label that ends up in the output CSV name.
#
# The terms are selected through `batch`, so the label and the term list can
# no longer disagree. Previously both combined lists were assigned to
# `search_terms` one after the other, so the second list always ran while
# `batch` still said 'all_1'.
#
# 'all_1' and 'all_2' together cover the earlier exploratory batches 1-5;
# batch 6 was never merged into a combined list and is kept under its own key.
#
# Misspelled queries were corrected ('lime disease', 'chronic lower respitory
# disease'). Misspelled entries whose correct spelling is already present in
# one of the combined batches were dropped ('pancratic cancer',
# 'multiple scelrosis') so that no disease is fetched twice.
TERM_BATCHES = {
    'all_1': [
        'fever', 'diabetes type 2', "alzheimers", 'lupus', "parkinsons", 'dementia',
        'breast cancer', 'high blood pressure', 'heart disease', "lymphoma", 'clinical depression',
        "bipolar disorder", 'arthritis', 'pancreatic cancer', 'Lyme disease',
        'amyotrophic lateral sclerosis', 'obesity', "acquired immunodeficiency syndrome",
    ],
    'all_2': [
        'thyroid cancer', 'kidney failure',
        "multiple sclerosis", "lung cancer", 'melanoma', 'Non-melanoma skin cancer',
        'prostate cancer', 'Colorectal cancer', 'non-hodgkin lymphoma', 'insomnia',
        'anxiety disorder', 'dystonia', 'chronic lower respiratory disease', 'influenza',
        'stroke', 'hepatitis b',
    ],
    '6': [
        'obsessive-compulsive disorder', 'post-traumatic stress disorder', 'concussion',
        'bulimia nervosa', 'anorexia nervosa', 'attention deficit hyperactivity disorder',
    ],
}

# Batch to run. 'all_2' is the list that was effectively active before this
# change, because the second assignment overwrote the first.
batch = 'all_2'

search_terms = TERM_BATCHES[batch]

# Column order of the exported table. 'disease' holds the search term that
# produced the row.
EXPORT_COLUMNS = ['med_type', 'keywords', 'Date', 'language', 'journal_title',
                  'ISSN', 'Article_title', 'abstract', 'disease']


def _publication_date(article):
    """Return the publication date as 'year/month/day', or None if unavailable.

    The first ``ArticleDate`` entry is preferred. When it is missing, the
    journal issue's ``PubDate`` year and month are used with the day fixed to
    '1'. If that is incomplete as well, None is returned.
    """
    try:
        article_date = article['ArticleDate'][0]
        return '/'.join(str(article_date[part]) for part in ('Year', 'Month', 'Day'))
    except (KeyError, IndexError):
        pass

    try:
        pub_date = article['Journal']['JournalIssue']['PubDate']
        return '/'.join((str(pub_date['Year']), str(pub_date['Month']), '1'))
    except KeyError:
        return None


def _paper_record(paper, disease):
    """Flatten one entry of ``papers['PubmedArticle']`` into a row dict.

    Optional fields that are absent (or present but empty) become '' instead
    of raising, so one sparse record cannot abort the whole run.
    """
    medline = paper['MedlineCitation']
    article = medline['Article']

    # type of medication/treatment/chemical used (first listed substance)
    try:
        med_type = str(medline['ChemicalList'][0]['NameOfSubstance'])
    except (KeyError, IndexError):
        med_type = ''

    # keywords of the first keyword list, comma separated
    try:
        keywords = ",".join(str(keyword) for keyword in medline['KeywordList'][0])
    except (KeyError, IndexError):
        keywords = ''

    # ISSN ID of the journal
    try:
        issn = str(article['Journal']['ISSN'])
    except KeyError:
        issn = ''

    # Full abstract. AbstractText can hold several parts; they are joined with
    # a space so the last word of one part is not fused with the first word of
    # the next.
    try:
        abstract = " ".join(str(part) for part in article['Abstract']['AbstractText'])
    except KeyError:
        abstract = ''

    return {
        'med_type': med_type,
        'keywords': keywords,
        'Date': _publication_date(article),
        # NOTE(review): this stores the string form of the whole Language
        # value (kept as-is so the CSV format does not change).
        'language': str(article['Language']),
        'journal_title': str(article['Journal']['Title']),
        'ISSN': issn,
        'Article_title': str(article['ArticleTitle']),
        'abstract': abstract,
        'disease': disease,
    }


def _report_progress(j, n_papers, disease):
    """Print progress roughly every 10% of the papers and after the last one."""
    # Integer step: a float modulo (j % (n_papers * .10)) almost never equals
    # zero unless n_papers is a multiple of 10.
    step = max(1, n_papers // 10)
    is_last = j == n_papers - 1
    if j % step != 0 and not is_last:
        return
    print("Article #{} processed".format(j))
    if is_last:
        print("All articles for {} term processed.".format(disease))
    print('---------')


# one dataframe per search term, combined once after the loop
term_frames = []

for disease in search_terms:

    results = search(disease)

    # handle empty results: skip the request when there are no IDs, and keep
    # treating a RuntimeError from the fetch as "nothing found"
    id_list = results['IdList']
    try:
        papers = fetch_details(id_list) if id_list else None
    except RuntimeError:
        papers = None
    if papers is None:
        print("no results found for search term: {}".format(disease))
        continue

    # get number of papers found
    articles = papers['PubmedArticle']
    n_papers = len(articles)

    print("Search found {} results".format(n_papers))
    print('')

    # Collect plain dicts and build the frame once; concatenating a one-row
    # frame per paper is quadratic in the number of papers.
    records = []
    for j, paper in enumerate(articles):
        records.append(_paper_record(paper, disease))
        _report_progress(j, n_papers, disease)

    search_df = pd.DataFrame(records, columns=EXPORT_COLUMNS)
    if not search_df.empty:
        term_frames.append(search_df)

# create final dataframe for all search terms
if term_frames:
    export_df = pd.concat(term_frames, ignore_index=True)
else:
    export_df = pd.DataFrame(columns=EXPORT_COLUMNS)

print("All terms processed.")

export_df.to_csv("pubmed_abstracts_batch{}.csv".format(batch))


Search found 750 results

Article #0 processed
---------
Article #75 processed
---------
Article #150 processed
---------
Article #225 processed
---------
Article #300 processed
---------
Article #375 processed
---------
Article #450 processed
---------
Article #525 processed
---------
Article #600 processed
---------
Article #675 processed
---------
Article #749 processed
All articles for thyroid cancer term processed.
---------
Search found 750 results

Article #0 processed
---------
Article #75 processed
---------
Article #150 processed
---------
Article #225 processed
---------
Article #300 processed
---------
Article #375 processed
---------
Article #450 processed
---------
Article #525 processed
---------
Article #600 processed
---------
Article #675 processed
---------
Article #749 processed
All articles for kidney failure term processed.
---------
Search found 750 results

Article #0 processed
---------
Article #75 processed
---------
Article #150 processed
---------
Article #2

In [31]:
# Summarise the collected table: row count, column names and non-null counts.
export_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11249 entries, 0 to 11248
Data columns (total 9 columns):
Article_title    11249 non-null object
Date             10873 non-null object
ISSN             11249 non-null object
abstract         11249 non-null object
disease          11249 non-null object
journal_title    11249 non-null object
keywords         11249 non-null object
language         11249 non-null object
med_type         11249 non-null object
dtypes: object(9)
memory usage: 791.0+ KB
