### This notebook scrapes data about biomedical papers from PubMed, the literature database of the U.S. National Institutes of Health (NIH), through its Entrez API.

In [1]:
import os

import numpy as np
import pandas as pd
from Bio import Entrez
from Bio.Entrez import efetch

In [2]:
def print_abstract(pmid):
    """Print the plain-text abstract record for a single PubMed ID.

    Args:
        pmid: PubMed identifier of the paper, passed to efetch as ``id``.

    Returns:
        None. The fetched text is written to standard output.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    try:
        print(handle.read())
    finally:
        # Release the underlying connection even if reading fails.
        handle.close()

In [3]:
def search(query, retmax=20):
    """Run a relevance-sorted PubMed search for ``query``.

    Args:
        query: Search term forwarded to Entrez.esearch as ``term``.
        retmax: Maximum number of IDs to request. Defaults to 20, the value
            that used to be hard-coded here.

    Returns:
        The parsed esearch response as produced by Entrez.read. The list of
        citation IDs is available as results['IdList'].
    """
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()

In [4]:
def create_idlist(n, end=27000000):
    """Return ``n`` consecutive PMID strings ending just below ``end``.

    The list covers the integers ``end - n`` through ``end - 1`` in ascending
    order, each converted to ``str``.

    Args:
        n: Number of IDs to generate. Zero or a negative value yields an
            empty list.
        end: Exclusive upper bound of the ID range. Defaults to 27000000, the
            bound that used to be hard-coded here.

    Returns:
        A list of ``n`` PMID strings.
    """
    return [str(pmid) for pmid in range(end - n, end)]

In [5]:
def fetch_details(id_list):
    """Fetch and parse the PubMed XML records for a list of PMIDs.

    Args:
        id_list: Iterable of PMID strings; they are joined with commas and
            sent in a single efetch request.

    Returns:
        The parsed efetch response as produced by Entrez.read.
    """
    ids = ','.join(id_list)
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()

In [6]:
def get_title(paper):
    """Look up the article title inside a parsed paper record.

    Raises KeyError if the record lacks any key along the path.
    """
    article = paper['MedlineCitation']['Article']
    return article['ArticleTitle']

In [7]:
def get_abstract(paper):
    """Return the first abstract text entry of a parsed paper record.

    Only element 0 of ``AbstractText`` is returned; any further entries in
    that list are ignored.

    Args:
        paper: Parsed paper record (nested mapping).

    Returns:
        The first ``AbstractText`` entry, or ``np.nan`` when the record has
        no abstract at the expected path.
    """
    try:
        return paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
    except (KeyError, IndexError, TypeError):
        # Only missing/ill-shaped data counts as "no abstract"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan

In [8]:
def get_year(paper):
    """Return the publication year stored in a parsed paper record.

    Args:
        paper: Parsed paper record (nested mapping).

    Returns:
        The ``Year`` value of the journal issue's ``PubDate``, or ``np.nan``
        when that path is not present in the record.
    """
    try:
        return paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year']
    except (KeyError, IndexError, TypeError):
        # Only missing/ill-shaped data counts as "no year"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan

In [9]:
def get_journal(paper):
    """Return the journal title stored in a parsed paper record.

    Args:
        paper: Parsed paper record (nested mapping).

    Returns:
        The journal ``Title`` value, or ``np.nan`` when that path is not
        present in the record.
    """
    try:
        return paper['MedlineCitation']['Article']['Journal']['Title']
    except (KeyError, IndexError, TypeError):
        # Only missing/ill-shaped data counts as "no journal"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan

In [10]:
def get_citations(paper):
    """Return the PMIDs listed in a record's ``CommentsCorrectionsList``.

    Args:
        paper: Parsed paper record (nested mapping).

    Returns:
        A list with ``str(entry['PMID'])`` for every entry of the list, or
        ``np.nan`` when the list is absent or any entry has no ``PMID``.
    """
    try:
        return [
            str(citation['PMID'])
            for citation in paper['MedlineCitation']['CommentsCorrectionsList']
        ]
    except (KeyError, IndexError, TypeError):
        # Only missing/ill-shaped data counts as "no citations"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan

In [11]:
def get_tags(paper):
    """Return the descriptor names from a record's ``MeshHeadingList``.

    Args:
        paper: Parsed paper record (nested mapping).

    Returns:
        A list with ``str(entry['DescriptorName'])`` for every entry of the
        list, or ``np.nan`` when the list is absent or any entry has no
        ``DescriptorName``.
    """
    try:
        return [
            str(tag['DescriptorName'])
            for tag in paper['MedlineCitation']['MeshHeadingList']
        ]
    except (KeyError, IndexError, TypeError):
        # Only missing/ill-shaped data counts as "no tags"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan

In [12]:
def get_authors(paper):
    """Return the last names from a record's ``AuthorList``.

    Args:
        paper: Parsed paper record (nested mapping).

    Returns:
        A list with ``str(entry['LastName'])`` for every entry of the list,
        or ``np.nan`` when the list is absent or any entry has no
        ``LastName``.
    """
    try:
        return [
            str(auth['LastName'])
            for auth in paper['MedlineCitation']['Article']['AuthorList']
        ]
    except (KeyError, IndexError, TypeError):
        # Only missing/ill-shaped data counts as "no authors"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan

In [13]:
def get_paper_info(paper, id):
    """Collect the extracted fields of one paper into a single tuple.

    The tuple is ordered as
    (id, title, authors, year, journal, abstract, tags, citations),
    where ``id`` is passed through unchanged and the remaining entries come
    from the matching ``get_*`` helper applied to ``paper``.
    """
    return (
        id,
        get_title(paper),
        get_authors(paper),
        get_year(paper),
        get_journal(paper),
        get_abstract(paper),
        get_tags(paper),
        get_citations(paper),
    )

In [14]:
if __name__ == '__main__':
    # NCBI asks E-utilities clients to supply a contact address. Take it from
    # the ENTREZ_EMAIL environment variable so no address is hard-coded here.
    contact_email = os.environ.get('ENTREZ_EMAIL')
    if contact_email:
        Entrez.email = contact_email

    id_list = create_idlist(10000)
    print(len(id_list))

    # Let a failed fetch surface directly. Swallowing it left `papers`
    # undefined, so the run died later with an unrelated NameError.
    papers = fetch_details(id_list)

    # The response can hold fewer records than were requested, so a record's
    # position does not identify its PMID. Read the PMID from the record
    # itself instead of pairing it with id_list by index.
    paper_list = [
        get_paper_info(paper, str(paper['MedlineCitation']['PMID']))
        for paper in papers['PubmedArticle']
    ]

    df = pd.DataFrame(paper_list, columns=['id', 'title', 'authors', 'year', 'journal', 'abstract', 'tags', 'citations'])

10000


Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


In [15]:
# Show how many records were collected into paper_list, then preview the
# first ten rows of the assembled DataFrame.
print(len(paper_list))
df.head(10)

9632


Unnamed: 0,id,title,authors,year,journal,abstract,tags,citations
0,26990000,Conjugation of Uridine with Oleanolic Acid Der...,"[Cheng, Su, Huang, Liu, Zheng, Chen]",2016,Chemical biology & drug design,According to fused two bioactive moieties toge...,"[Antineoplastic Agents, Apoptosis, Caspase 3, ...",
1,26990001,Atrial Fibrillation Predicts Cardiovascular Ou...,"[Vrsalovic, Vucur, Jelakovic]",2016,"Journal of clinical hypertension (Greenwich, C...",,"[Atrial Fibrillation, Heart Failure, Humans, H...",
2,26990002,Vav1 Regulates Mesenchymal Stem Cell Different...,"[Qu, Wang, Min, McKennett, Keller, Lin]",2016,"Stem cells (Dayton, Ohio)",Mesenchymal stem cells (MSCs) are multipotent ...,"[Adipocytes, Adipogenesis, Adiposity, Animals,...",
3,26990003,Treatment of Mental or Physical Health Problem...,"[Conway, Schmied, Larson, Galarneau, Hammer, Q...",2016,Journal of traumatic stress,The primary aim of this study was to evaluate ...,"[Adult, Combat Disorders, Databases, Factual, ...",
4,26990004,Sex Work Regulation and Sexually Transmitted I...,"[Quast, Gonzalez]",2017,Health economics,While reducing the transmission of sexually tr...,"[Adolescent, Adult, Age Factors, Aged, Female,...",
5,26990005,How to manage the logistics of biological ther...,[Headon],2016,Journal of gastroenterology and hepatology,,"[Biological Factors, Delivery of Health Care, ...",
6,26990006,Hardware complications and failure of three-un...,"[Shi, Zhang, Qiao, Qian, Mo, Lai]",2017,Clinical oral implants research,The aim of the present study was to assess the...,"[Adult, Aged, Aged, 80 and over, Dental Porcel...",
7,26990007,Impact of human milk pasteurization on the kin...,"[Deglaire, De Oliveira, Jardin, Briard-Bion, E...",2016,Electrophoresis,"Holder pasteurization (62.5°C, 30 min) ensures...","[Chromatography, Liquid, Digestion, Humans, In...",
8,26990008,Attentional guidance by relative features: Beh...,"[Schönhammer, Grubert, Kerzel, Becker]",2016,Psychophysiology,Our ability to select task-relevant informatio...,"[Adult, Attention, Cues, Electroencephalograph...",
9,26990009,Identifying Older Adults with Serious Illness:...,"[Kelley, Covinsky, Gorges, McKendrick, Bollens...",2017,Health services research,"To create and test three prospective, increasi...","[Activities of Daily Living, Aged, Aged, 80 an...","[15493448, 17187548, 23838378, 9441588, 198285..."


In [16]:
# Summarize the columns and their non-null counts to see which fields are
# most often missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9632 entries, 0 to 9631
Data columns (total 8 columns):
id           9632 non-null object
title        9632 non-null object
authors      9414 non-null object
year         9051 non-null object
journal      9632 non-null object
abstract     8342 non-null object
tags         7690 non-null object
citations    3673 non-null object
dtypes: object(8)
memory usage: 602.1+ KB


In [17]:
# Discard every row that has no title or no tags, then summarize what is left.
df = df.dropna(subset=['title', 'tags'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7690 entries, 0 to 9631
Data columns (total 8 columns):
id           7690 non-null object
title        7690 non-null object
authors      7554 non-null object
year         7316 non-null object
journal      7690 non-null object
abstract     6812 non-null object
tags         7690 non-null object
citations    2656 non-null object
dtypes: object(8)
memory usage: 540.7+ KB


In [18]:
# Write df to ./data.csv with a header row and without the index column.
# NOTE: fields are separated by tabs even though the file has a .csv
# extension, so readers must pass sep='\t' when loading it.
df.to_csv('./data.csv', index=False, header=True, sep='\t')