In [1]:
# imports
from Bio import Entrez
import pandas as pd

In [2]:
# API_key
Entrez.api_key = "9499c5280176400b132fc318cd718feefd08"
# user_email (optional)
Entrez.email = "alexandrosangelis2@gmail.com"

In [3]:
# Function to retrieve set of UIDs with esearch
def search(query, mindate, maxdate):
    search_results = Entrez.read(
    Entrez.esearch(
        db = "pubmed",
        retmax = retmax, # number of records
        datetype = "pdat", # publication date
        mindate = mindate, # start search date 
        maxdate = maxdate, # end search year
        term = query,
        sort = "Best Match"
         )
    )
    return search_results

In [4]:
# Function to retrieve full Data records with efetch
def fetch_details(id_list):
    ids = ','.join(id_list)
    handle = Entrez.efetch(
        db = 'pubmed',
        retmode = 'xml',
        id = ids
    )
    results = Entrez.read(handle)
    return results

In [5]:
# Function to create a DataFrame
def create_dataframe(id_list):
    
    title_list = []
    abstract_list = []
    doi_list = []

    for i, paper in enumerate (papers['PubmedArticle']):
        # Title
        title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
        # Abstract
        try:
            abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
        except:
            abstract_list.append('No Abstract')
        # Doi
        try:
            doi_list.append(paper['PubmedData']['ArticleIdList'][1])
        except:
            doi_list.append('No Doi')
            
    df = pd.DataFrame(list(zip(doi_list, title_list, abstract_list)), 
                      columns=['DOI', 'Title', 'Abstract'])
    
    return df

In [6]:
# Query for PubMed
query = '(( (biokinetics) OR (kinetics) OR (absorption) OR (pharmacokinetics) OR (toxicokinetics) OR (uptake) OR (biodistribution) OR (distribution) OR (accumulation) OR (bioaccumulation) OR (disposition) OR (clearance) OR (excretion) OR ((radiolabeled) OR (imaging)) OR ((in) AND (vivo)) ) AND ( (nanomaterials) OR (nanomaterial) OR (nanoparticles) OR (nanoparticle) OR (nanoscale) OR (nanotechnology) OR (nanoformulation) OR (nanoemulsion) OR (nanoemulsions) ) AND ( (rats) OR (rat) OR (mouse) OR (mice) OR (monkey) OR (monkeys) OR (pig) OR (pigs) OR (rabbits) OR (rabbit) OR (dogs) OR (dog) (macaques) OR (macaque) OR (chimpanzee) OR (chimpanzees) OR (human) OR (humans) OR (rodent) OR (rodents) OR (guinea-pig) OR (guinea-pigs) OR (hamster) OR (hamsters) ))'

In [7]:
# Search years
start_year = 2004
end_year = 2023
period = end_year - start_year

In [10]:
# Max Number of Records that can be retrieved 
retmax = 9500

In [11]:
# loop over the years to create DataFrame

df_all_1 = pd.DataFrame()

for year in range(end_year, start_year-1, -1):
    maxdate = year
    mindate = year
    
    id_list = search(query, mindate, maxdate)['IdList'] # set of UIDs
    papers = fetch_details(id_list) # papers records
    df1 = create_dataframe(id_list) # built DataFrame
    
    df_all_1 = pd.concat([df_all_1, df1], ignore_index=True)
    
    print("Year:", year)
    print('Number of Papers:', len(id_list))
    print('\n')

Year: 2023
Number of Papers: 4738


Year: 2022
Number of Papers: 6736


Year: 2021
Number of Papers: 8387


Year: 2020
Number of Papers: 9177


Year: 2019
Number of Papers: 9317


Year: 2018
Number of Papers: 8434


Year: 2017
Number of Papers: 7799


Year: 2016
Number of Papers: 7374


Year: 2015
Number of Papers: 6837


Year: 2014
Number of Papers: 6019


Year: 2013
Number of Papers: 5306


Year: 2012
Number of Papers: 4478


Year: 2011
Number of Papers: 3830


Year: 2010
Number of Papers: 3108


Year: 2009
Number of Papers: 2231


Year: 2008
Number of Papers: 1666


Year: 2007
Number of Papers: 1301


Year: 2006
Number of Papers: 994


Year: 2005
Number of Papers: 616


Year: 2004
Number of Papers: 514




In [12]:
# See size of DataFrame and print it
print('Size of DataFrame:', df_all_1.shape)
df_all_1

Size of DataFrame: (98689, 3)


Unnamed: 0,DOI,Title,Abstract
0,10.3390/molecules28196972,Multifunctional Novel Nanoplatform for Effecti...,Photodynamic therapy (PDT) is an effective non...
1,10.3390/molecules28196760,Compatibility of Nucleobases Containing Pt(II)...,The therapeutic advantages of some platinum co...
2,10.3390/nano13192629,Toxic Effects of Copper Fungicides on the Deve...,Copper-based fungicides have been used to cont...
3,10.3390/polym15193969,Recent Advancements and Strategies for Overcom...,Glioblastoma multiforme (GBM) is a highly aggr...
4,10.3390/ijms241915027,Imidazo-Pyrazole-Loaded Palmitic Acid and Poly...,"Neuroblastoma (NB) is a childhood cancer, comm..."
...,...,...,...
98684,10.1016/s0142-9612(03)00593-3,Aligned biodegradable nanofibrous structure: a...,"A unique biodegradable nanofibrous structure, ..."
98685,10.1016/s0142-9612(03)00576-3,Studies on the microspheres comprised of recon...,Microspheres comprised of hydroxyapatite parti...
98686,PMC1223892,Development of a novel fluorogenic proteolytic...,The present study describes the in vivo detect...
98687,10.1007/s00330-003-2070-x,Preoperative detection of hepatocellular carci...,The aim of this study was to compare Gd-DTPA-e...


In [13]:
# Store a xlsx format file
df_all_1.to_excel('Scraped_PubMed_Data(2004-2023).xlsx', index=False)

In [14]:
# Store a csv format file
df_all_1.to_csv('Scraped_PubMed_Data(2004-2023).csv', index=False)