In [1]:
from Bio import Entrez
from Bio.Entrez import efetch
import pandas as pd
import numpy as np

In [2]:
def print_abstract(pmid):
    """For a given PMID number, print the abstract for the paper"""
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    print (handle.read())

In [3]:
#print_abstract(20000249)

In [4]:
def search(query):
    """define a function for performing the search
    The list of citation IDs will be available as results[‘IdList’]."""
    handle = Entrez.esearch(db='pubmed', 
                            sort='relevance', 
                            retmax='20',
                            retmode='xml', 
                            term=query)
    results = Entrez.read(handle)
    return results

In [5]:
def create_idlist(n):
    """Return a list of PMID numbers with size n. The earliest paper PMID is 28785052 published on August 2017"""
    idlist = []
    for i in range(28000000-n, 28000000):
        idlist.append(str(i))
    return idlist

In [6]:
def fetch_details(id_list):
    """A function that gets a list of PMIDs and returns a xml of papers' information"""
    ids = ','.join(id_list)
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    results = Entrez.read(handle)
    return results

In [7]:
def get_title(paper):
    """Given a xml paper info, this function returns the paper's title"""
    return paper['MedlineCitation']['Article']['ArticleTitle']   

In [8]:
def get_abstract(paper):
    """Given a xml paper info, this function returns the paper's abstract"""
    try:
        return paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
    except:
        return np.nan

In [9]:
def get_year(paper):
    """Given a xml paper info, this function returns the paper's year of publication"""    
    try:
        return paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year']
    except:
        return np.nan

In [10]:
def get_journal(paper):
    """Given a xml paper info, this function returns the paper's journal name"""    
    try:
        return paper['MedlineCitation']['Article']['Journal']['Title']
    except:
        return np.nan

In [11]:
def get_citations(paper):
    """Given a xml paper info, this function returns a list containing all the PMIDs for paper's citations"""    
    citations = []
    try:
        for citation in paper['MedlineCitation']['CommentsCorrectionsList']:
            citations.append(str(citation['PMID']))
    except:
        citations = np.nan
    return citations

In [12]:
def get_tags(paper):
    """Given a xml paper info, this function returns a list containing all tags for the paper"""    
    # mesh_terms_raw
    tags_list = []
    try:
        for tag in paper['MedlineCitation']['MeshHeadingList']:
            tags_list.append(tag['DescriptorName'])
    except:
        tags_list = np.nan
#    return "; ".join(tags_list)
    return tags_list

In [13]:
def get_authors(paper):
    """Given a xml paper info, this function returns a list containing last names of paper's authors"""
    # mesh_terms_raw
    authors_list = []
    try:
        for auth in paper['MedlineCitation']['Article']['AuthorList']:
        #print (auth)
            authors_list.append(auth['LastName'])
    except:
        authors_list = np.nan
#    return "; ".join(tags_list)
    return authors_list

In [14]:
def get_paper_info(paper, id):
    """Given paper's xml and PMID, it returns a tuple containing infomration about the paper"""
    
    title = get_title(paper)
    authors = get_authors(paper)
    tags = get_tags(paper)
    citations = get_citations(paper)
    year = get_year(paper)
    abstract = get_abstract(paper)
    journal = get_journal(paper)
    return (id, title, authors, year, journal, abstract, tags, citations)

In [15]:
if __name__ == '__main__':
#     results = search('fever')
#     id_list = results['IdList']
    id_list = create_idlist(10000)
    print(len(id_list))
    try:
        papers = fetch_details(id_list)
    except:
        pass
    
    paper_list = []
#     print(id_list)
    for i, paper in enumerate(papers['PubmedArticle']):
        if i==20:
            print (paper)
        paper_list.append(get_paper_info(paper, id_list[i]))
#           print(paper)
#         print("%d) %s" % (i+1, get_title(paper)))
#         print("%d) %s" % (i+1, get_tags(paper)))
#         print("%d) %s" % (i+1, get_citations(paper)))
#         print("%d) %s" % (i+1, get_year(paper)))
#         print("%d) %s" % (i+1, get_abstract(paper)))
#         print("%d) %s" % (i+1, get_journal(paper)))
#         print('-------------------------')
    df = pd.DataFrame(paper_list, columns=['id', 'title', 'authors', 'year', 'journal', 'abstract', 'tags', 'citations'])
    # Pretty print the first paper in full to observe its structure
    #import json
    #print(json.dumps(papers[0], indent=2, separators=(',', ':')))

10000


Email address is not specified.

To make use of NCBI's E-utilities, NCBI requires you to specify your
email address with each request.  As an example, if your email address
is A.N.Other@example.com, you can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.


{'MedlineCitation': DictElement({'CitationSubset': [], 'OtherID': [], 'OtherAbstract': [], 'KeywordList': [], 'SpaceFlightMission': [], 'GeneralNote': [], 'PMID': StringElement('27990020', attributes={'Version': '1'}), 'DateRevised': {'Year': '2017', 'Month': '02', 'Day': '20'}, 'Article': DictElement({'ELocationID': [StringElement('10.1038/nrg.2016.163', attributes={'EIdType': 'doi', 'ValidYN': 'Y'})], 'Language': ['eng'], 'ArticleDate': [DictElement({'Year': '2016', 'Month': '12', 'Day': '19'}, attributes={'DateType': 'Electronic'})], 'Journal': {'ISSN': StringElement('1471-0064', attributes={'IssnType': 'Electronic'}), 'JournalIssue': DictElement({'Volume': '18', 'Issue': '2', 'PubDate': {'Year': '2017', 'Month': 'Feb'}}, attributes={'CitedMedium': 'Internet'}), 'Title': 'Nature reviews. Genetics', 'ISOAbbreviation': 'Nat. Rev. Genet.'}, 'ArticleTitle': 'Genetic engineering: CREATE-ing genome-wide designed mutations.', 'Pagination': {'MedlinePgn': '69'}, 'AuthorList': ListElement([D

In [16]:
print(len(paper_list))
df.head(1000)

8765


Unnamed: 0,id,title,authors,year,journal,abstract,tags,citations
0,27990000,Phenotypic difference between Δ(srl-recA)306 a...,"[Suzuki, Kaidow, Meya, Masuya, Shiina]",2017,The Journal of general and applied microbiology,Many significant gene mutations in E. coli hav...,"[DNA Transposable Elements, Escherichia coli, ...",
1,27990001,Archaeal histone distribution is associated wi...,"[Nishida, Oshima]",2017,The Journal of general and applied microbiology,"A subpopulation of Archaea possesses histones,...","[Archaea, Base Composition, DNA, Archaeal, Evo...",
2,27990002,Cinepazide Maleate Improves Cognitive Function...,"[Li, Zhang, Zhang, Zou, Gong, Fu]",2017,Biological & pharmaceutical bulletin,To determine the combined effect of type 2 dia...,"[Amyloid Precursor Protein Secretases, Animals...",
3,27990003,Current status of genome editing in vector mos...,"[Reegan, Ceasar, Paulraj, Ignacimuthu, Al-Dhabi]",2017,Bioscience trends,Mosquitoes pose a major threat to human health...,"[Animals, Culicidae, Gene Editing, Humans, Ins...",
4,27990004,Endoscopic and surgical ampullectomy for non-i...,"[Dubois, Labgaa, Dorta, Halkic]",2017,Bioscience trends,"Non-invasive ampullary tumors, may be treated ...","[Algorithms, Cell Proliferation, Duodenal Neop...",
5,27990005,Ledipasvir and sofosbuvir for recurrent hepati...,"[Oya, Sugawara, Watanabe, Yoshimaru, Honda, Ha...",2017,Bioscience trends,Management of recurrent hepatitis C following ...,"[Antiviral Agents, Benzimidazoles, Female, Flu...",
6,27990006,Nitric oxide donor hybrid compounds as promisi...,"[Ding, Zang, Gao, Gao, Duan, Li, Xu, Zhang]",2017,Drug discoveries & therapeutics,Nitric oxide (NO) plays important roles in car...,,
7,27990007,A newborn with hemorrhagic meningoencephalitis...,"[Coskun, Akman, Yildirim, Demir]",2017,Drug discoveries & therapeutics,Neonatal meningoencephalitis is a severe condi...,,
8,27990008,Anemia Treatment by Erythropoiesis-stimulating...,"[Yoshida, Hayashi]",2017,The Keio journal of medicine,Anemia in chronic kidney disease (CKD) is a ri...,"[Aged, Aged, 80 and over, Anemia, Darbepoetin ...",
9,27990009,Distribution of methicillin-resistant coagulas...,"[Fungwithaya, Brikshavana, Chanchaithong, Prap...",2017,The Journal of veterinary medical science,This study aimed to investigate the spread of ...,"[Animal Technicians, Animals, Cat Diseases, Ca...","[25196800, 22657930, 2275856, 25008316, 225155..."


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8765 entries, 0 to 8764
Data columns (total 8 columns):
id           8765 non-null object
title        8765 non-null object
authors      8563 non-null object
year         8303 non-null object
journal      8765 non-null object
abstract     7509 non-null object
tags         5251 non-null object
citations    4345 non-null object
dtypes: object(8)
memory usage: 547.9+ KB


In [18]:
df.to_csv('./data.csv', index=False, header=True, sep='\t')

In [24]:
print(df.tags[0])

[StringElement('DNA Transposable Elements', attributes={'UI': 'D004251', 'MajorTopicYN': 'N'}), StringElement('Escherichia coli', attributes={'UI': 'D004926', 'MajorTopicYN': 'N'}), StringElement('Gene Deletion', attributes={'UI': 'D017353', 'MajorTopicYN': 'Y'}), StringElement('Genes, Bacterial', attributes={'UI': 'D005798', 'MajorTopicYN': 'N'}), StringElement('Genotype', attributes={'UI': 'D005838', 'MajorTopicYN': 'N'}), StringElement('High-Throughput Nucleotide Sequencing', attributes={'UI': 'D059014', 'MajorTopicYN': 'N'}), StringElement('Mutagenesis, Insertional', attributes={'UI': 'D016254', 'MajorTopicYN': 'Y'}), StringElement('Polymerase Chain Reaction', attributes={'UI': 'D016133', 'MajorTopicYN': 'N'}), StringElement('Rec A Recombinases', attributes={'UI': 'D011938', 'MajorTopicYN': 'N'})]
