In [41]:
def search_and_export_articles(terms=None,export_suffix=None,max_articles=1000,export=True):
    '''
    Search Pubmed for one or more terms and return the matching articles as a dataframe,
    exporting it as a csv unless told otherwise. Requests are sent through Biopython's
    Entrez module.

    Arguments:

    - terms: a string or a list of strings. The term(s) to search pubmed for.
      A single string is treated as a one-element list.

    - export_suffix: optional. Inserted into the exported file name, which is
      pubmed_abstracts_export_{suffix}_{date}.csv, where date is the day of export
      (YYYY-MM-DD). When left as None the suffix part is omitted.

    - max_articles: an integer, default is 1000. The max number of articles requested
      per search term.

    - export: boolean, whether the resulting dataframe is written to a csv in the
      current working directory. Default is True.

    Returns:

    A pandas DataFrame with one row per article per search term and these columns:

    - PMID: Pubmed Article ID
    - med_type: name of the first entry in the article's chemical list ('' if none)
    - keywords: keywords of the first keyword list, separated by ','
    - date: 'year/month/day' taken from the article date, else the completion date,
      else the journal issue date; the day falls back to '1', and the whole value is
      None when no year and month are available
    - language: string form of the article's language list (the saved output in this
      notebook shows values such as "['eng']")
    - journal_title: title of the publishing journal
    - ISSN: identifier for the publishing journal ('' if absent)
    - article_title: article title
    - abstract: abstract text, parts separated by a single space ('' if absent)
    - authors: authors of the paper, separated by ';' (None if there is no author list)
    - disease: the search term that returned the article

    Raises:

    - ValueError: if terms is None, a tuple or a dict.

    Progress messages are printed while articles are processed, and the head of the
    final dataframe is printed before returning.

    Credit for the search and fetch_details helper functions, which wrap the Biopython
    Entrez calls, goes to Marco Bonzanini, with this blogpost:
    https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/

    Relevant Websites:
    Pubmed: https://www.ncbi.nlm.nih.gov/pubmed/

    This function has the following dependencies:

    - Pandas
    - Biopython
    - datetime

    '''

    # Validate the input before importing anything, so a bad call fails immediately.
    if terms is None:
        raise ValueError("There must be a search term. Please specify one or more using the terms parameter.")
    if isinstance(terms, str):
        terms = [terms]
    elif isinstance(terms, (tuple, dict)):
        raise ValueError("Please enter the search terms in a list.")

    from datetime import datetime

    import pandas as pd
    from Bio import Entrez

    # Keep an e-mail the caller already configured; only fall back to the placeholder.
    if not getattr(Entrez, 'email', None):
        Entrez.email = 'youremail@example.com'

    # helper functions from https://marcobonzanini.com/2015/01/12/searching-pubmed-with-python/
    def search(query,max_articles=max_articles):
        handle = Entrez.esearch(db='pubmed',
                                sort='relevance',
                                retmax=max_articles, # the number of papers you want to return
                                retmode='xml',
                                term=query)
        try:
            return Entrez.read(handle)
        finally:
            # close the handle even when parsing fails
            handle.close()

    def fetch_details(id_list):
        handle = Entrez.efetch(db='pubmed',
                               retmode='xml',
                               id=','.join(id_list))
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    # 'disease' is written for every row, so it is declared with the other columns.
    columns = ['PMID','med_type','keywords','date','language','journal_title','ISSN',
               'article_title','abstract','authors','disease']

    # Rows are collected as plain dicts and turned into a dataframe once at the end;
    # concatenating a dataframe per article is quadratic in the number of articles.
    records = []

    for disease in terms:

        results = search(disease,max_articles=max_articles)
        id_list = results['IdList']

        #handle empty results
        if not id_list:
            print("no results found for search term: {}".format(disease))
            continue
        try:
            papers = fetch_details(id_list)
        except RuntimeError:
            print("no results found for search term: {}".format(disease))
            continue

        articles = papers['PubmedArticle']

        #get number of papers found
        n_papers = len(articles)

        print("Search found {} results".format(n_papers))
        print('')

        # Report roughly every 25% of the articles. The step is clamped to 1 so a
        # search returning only one or two papers cannot cause a modulo by zero.
        step = max(1, int(round(n_papers * .25)))

        for j, article in enumerate(articles):

            records.append(_paper_to_record(article, disease))

            is_last = j == (n_papers - 1)
            if j % step == 0 or is_last:
                print("Article #{} processed".format(j))
                if is_last:
                    print("All articles for {} term processed.".format(disease))
                print('---------')

    export_df = pd.DataFrame(records, columns=columns)

    print("All terms processed.")

    if export:
        name_parts = ["pubmed_abstracts_export"]
        if export_suffix is not None:
            name_parts.append(str(export_suffix))
        name_parts.append(datetime.today().strftime('%Y-%m-%d'))
        export_df.to_csv("_".join(name_parts) + ".csv")

    print(export_df.head())

    return export_df


def _dig(node, *path, default=None):
    '''Follow a chain of keys/indexes into nested containers; return default if any step is missing.'''
    for key in path:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            return default
    return node


def _publication_date(citation):
    '''Return 'year/month/day' from the first date source that has a year and a month, else None.'''
    candidates = (
        _dig(citation, 'Article', 'ArticleDate', 0),
        _dig(citation, 'DateCompleted'),
        _dig(citation, 'Article', 'Journal', 'JournalIssue', 'PubDate'),
    )
    for stamp in candidates:
        year = _dig(stamp, 'Year')
        month = _dig(stamp, 'Month')
        if year is None or month is None:
            continue
        day = _dig(stamp, 'Day')
        # a missing day is reported as the first of the month
        return "{}/{}/{}".format(year, month, '1' if day is None else day)
    return None


def _author_names(citation):
    '''Return the authors joined by ';', or None when the record has no author list.'''
    authors = _dig(citation, 'Article', 'AuthorList')
    if authors is None:
        return None
    names = []
    for author in authors:
        parts = [str(part) for part in (_dig(author, 'ForeName'), _dig(author, 'LastName')) if part]
        if parts:
            names.append(" ".join(parts))
            continue
        # An entry without personal name fields no longer discards the whole list;
        # use its 'CollectiveName' if the record carries one, otherwise skip it.
        collective = _dig(author, 'CollectiveName')
        if collective:
            names.append(str(collective))
    return ";".join(names)


def _paper_to_record(article, disease):
    '''Flatten one parsed PubmedArticle entry into a dict keyed by the export column names.'''
    citation = article['MedlineCitation']
    entry = citation['Article']
    journal = entry['Journal']

    #get type of medication/treatment/chemical used
    substance = _dig(citation, 'ChemicalList', 0, 'NameOfSubstance')

    #get keywords (first keyword list only)
    keywords = ",".join(str(word) for word in _dig(citation, 'KeywordList', 0, default=[]))

    # Get ISSN ID for Journal
    issn = _dig(journal, 'ISSN')

    # get full abstract; the parts are joined with a space so they do not run together
    abstract_parts = _dig(entry, 'Abstract', 'AbstractText')
    if abstract_parts is None:
        abstract = ''
    elif isinstance(abstract_parts, str):
        abstract = str(abstract_parts)
    else:
        abstract = " ".join(str(part) for part in abstract_parts)

    # PMID, language, journal title and article title are indexed directly,
    # so a record missing one of them raises KeyError.
    return {
        'PMID': str(citation['PMID']),
        'med_type': '' if substance is None else str(substance),
        'keywords': keywords,
        'date': _publication_date(citation),
        'language': str(entry['Language']),
        'journal_title': str(journal['Title']),
        'ISSN': '' if issn is None else str(issn),
        'article_title': str(entry['ArticleTitle']),
        'abstract': abstract,
        'authors': _author_names(citation),
        'disease': disease,
    }

In [38]:
# All batch 1 search terms. Each string is sent to PubMed as a query and is also stored
# in the 'disease' column of the result, so spelling matters: 'lime disease' and
# 'chronic lower respitory disease' were misspelled and are corrected here.
search_terms = [
    'fever', 'diabetes type 2', 'pancreatic cancer', "alzheimers", 'lupus', "parkinsons", 'dementia',
    'breast cancer', 'high blood pressure', 'heart disease', "lymphoma", 'clinical depression',
    "bipolar disorder", 'arthritis', 'lyme disease', 'amyotrophic lateral sclerosis',
    'obesity', "acquired immunodeficiency syndrome", 'thyroid cancer', 'kidney failure',
    "multiple sclerosis", "lung cancer", 'melanoma', 'Non-melanoma skin cancer', 'prostate cancer', 'Colorectal cancer',
    'non-hodgkin lymphoma', 'insomnia', 'anxiety disorder', 'dystonia', 'chronic lower respiratory disease',
    'influenza', 'stroke', 'hepatitis b',
]

In [42]:
# Run the scrape for every term in search_terms, requesting up to 2000 articles per term.
# export is left at its default (True), so the call also attempts the CSV export.
# This issues network requests to PubMed for each term and is slow to re-run.
test_df = search_and_export_articles(terms=search_terms,export_suffix="all",max_articles=2000)

Search found 2000 results

Article #0 processed
---------
Article #200 processed
---------
Article #400 processed
---------
Article #600 processed
---------
Article #800 processed
---------
Article #1000 processed
---------
Article #1200 processed
---------
Article #1400 processed
---------
Article #1600 processed
---------
Article #1800 processed
---------
Article #1999 processed
All articles for fever term processed.
---------
Search found 2000 results

Article #0 processed
---------
Article #200 processed
---------
Article #400 processed
---------
Article #600 processed
---------
Article #800 processed
---------
Article #1000 processed
---------
Article #1200 processed
---------
Article #1400 processed
---------
Article #1600 processed
---------
Article #1800 processed
---------
Article #1999 processed
All articles for diabetes type 2 term processed.
---------
Search found 2000 results

Article #0 processed
---------
Article #200 processed
---------
Article #400 processed
---------


In [43]:
# Column-level summary (non-null counts and dtypes) of the scraped frame.
# NOTE(review): the saved output below lists 'Article_title' and 'Date' columns, while the
# function defined in this notebook writes 'article_title' and 'date' — the output looks
# stale relative to the current code; re-run on a fresh kernel to confirm.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64299 entries, 0 to 64298
Data columns (total 13 columns):
Article_title    64299 non-null object
Date             64202 non-null object
ISSN             64299 non-null object
PMID             64299 non-null object
abstract         64299 non-null object
article_title    0 non-null object
authors          61579 non-null object
date             0 non-null object
disease          64299 non-null object
journal_title    64299 non-null object
keywords         64299 non-null object
language         64299 non-null object
med_type         64299 non-null object
dtypes: object(13)
memory usage: 6.4+ MB


In [26]:
# Preview the first rows of the frame.
# NOTE(review): this cell's execution count (26) is lower than that of the cell defining the
# function (41), and the saved output has an 'Unnamed: 0' column — presumably the frame shown
# came from an earlier run (possibly reloaded from a CSV); verify with Restart & Run All.
test_df.head()

Unnamed: 0,Article_title,Date,ISSN,PMID,abstract,article_title,authors,date,disease,journal_title,keywords,language,med_type
0,Predictive Factors of Fever After Aneurysmal S...,2018/03/13,1878-8769,29548963,Fever is relatively common and worsens neurolo...,,Yung Ki Park;Hyeong-Joong Yi;Kyu-Sun Choi;Youn...,,fever,World neurosurgery,"Anterior communicating artery aneurysm,Brain i...",['eng'],Antipyretics
1,Tri-phasic fever in dengue fever.,2018/02/08,1758-1133,29419375,Dengue fever is an acute febrile illness with ...,,Pradeepa H D;Sathish B Rao;Ganaraj B;Gopalakri...,,fever,Tropical doctor,"24 hours,Continuous,conventional monitoring,de...",['eng'],
2,"Periconceptional maternal fever, folic acid in...",2017/11/02,1873-2585,29133009,Previous studies have shown an association bet...,,Stephen M Kerr;Samantha E Parker;Allen A Mitch...,,fever,Annals of epidemiology,"Fever,Folic acid,Neural tube defects,Spina bifida",['eng'],Vitamin B Complex
3,Neutropenic Fever.,2018/01/04,1558-1977,29078933,Fever is a common presenting complaint among a...,,Lindsey White;Michael Ybarra,,fever,Hematology/oncology clinics of North America,"Bacterial infection,Neutropenic fever,Risk str...",['eng'],Anti-Bacterial Agents
4,Detection of human parvovirus B19 in serum sam...,2017/09/22,1878-3511,28951104,It has been demonstrated that infection with h...,,Tony Bokalanga Wawina;Olivier Mbaya Tshiani;St...,,fever,International journal of infectious diseases :...,"Democratic Republic of the Congo,IgM antibody,...",['eng'],"Antibodies, Viral"
