# 0 IMPORTS

In [1]:
import numpy as np
import pandas as pd
import json

In [2]:
!pip install Bio



In [3]:
from Bio import Entrez

# 1 HELPER FUNCTIONS

In [4]:
def search(query,retrive):
  """Run a relevance-sorted PubMed ESearch and return the parsed result.

  Parameters
  ----------
  query : str
      Search term, forwarded to ESearch as ``term``.
  retrive : int or str
      Maximum number of records to return, forwarded as ``retmax``.

  Returns
  -------
  The structure produced by ``Entrez.read``. The list of citation IDs is
  available as ``results['IdList']``.
  """
  Entrez.email = 'naveenverse@gmail.com'
  handle = Entrez.esearch(
      db = 'pubmed',
      sort = 'relevance',
      retmax = retrive,
      retmode = 'xml',
      term = query
  )
  try:
    results = Entrez.read(handle)
  finally:
    # Always release the underlying connection, even if parsing fails.
    handle.close()
  return results

def fetch_details(id_list):
  """Download the full PubMed records for the given citation IDs via EFetch.

  Parameters
  ----------
  id_list : iterable
      PubMed IDs (strings or integers). They are joined into one
      comma-separated ``id`` argument.

  Returns
  -------
  The structure produced by ``Entrez.read``; the notebook reads the records
  from ``results['PubmedArticle']``. When ``id_list`` is empty no request is
  made and a plain dict with empty ``'PubmedArticle'`` and
  ``'PubmedBookArticle'`` lists is returned instead.
  """
  ids = ','.join(str(pubmed_id) for pubmed_id in id_list)
  if not ids:
    # Nothing to fetch: skip the network call and return an empty result
    # that downstream code can iterate over.
    return {'PubmedArticle': [], 'PubmedBookArticle': []}

  Entrez.email = 'naveenverse@gmail.com'
  handle = Entrez.efetch(
        db = 'pubmed',
        retmode = 'xml',
        id = ids
    ) #fetch the details for all the retrieved articles via the efetch utility
  try:
    results = Entrez.read(handle)
  finally:
    # Always release the underlying connection, even if parsing fails.
    handle.close()
  return results
def json_extract(papers, Key):
  """Pull the PMID and abstract text out of every record in ``papers``.

  Parameters
  ----------
  papers : mapping
      Parsed EFetch result; records are read from ``papers['PubmedArticle']``.
  Key : str
      Topic label stored alongside every record.

  Returns
  -------
  tuple of three equally long lists ``(Abstract, pmID, Keyword)``:
  abstract text (or ``'No abstract'`` when the record has none), the PMID as
  a string, and ``Key`` repeated once per record.
  """
  Abstract = []
  pmID = []
  Keyword = []
  for paper in papers['PubmedArticle']:
    citation = paper['MedlineCitation']
    pmID.append(str(citation['PMID']))
    Keyword.append(Key)
    try:
      sections = citation['Article']['Abstract']['AbstractText']
      if isinstance(sections, str):
        sections = [sections]
      # Join the sections directly instead of round-tripping through
      # json.dumps, which left JSON escapes (\" and \uXXXX) and '", "'
      # separators inside the stored text.
      text = ' '.join(str(section) for section in sections)
    except (KeyError, TypeError):
      # Record has no usable abstract field.
      text = 'No abstract'
    Abstract.append(text)
  return Abstract,pmID,Keyword
  

# 2 RETRIEVING THE DATA

Retrieving cancer-related papers

In [5]:
# Look up 'cancer' on PubMed (up to 200 IDs) and download the matching records.
results = search(query='cancer', retrive='200')
id_list = results['IdList']
cancer_papers = fetch_details(id_list=id_list)

# Show the first record as indented JSON to inspect how the fields are nested.
print(
    json.dumps(
        cancer_papers['PubmedArticle'][0],
        indent=3,
    )
)

{
   "MedlineCitation": {
      "CitationSubset": [
         "IM"
      ],
      "SpaceFlightMission": [],
      "OtherAbstract": [],
      "GeneralNote": [],
      "OtherID": [],
      "KeywordList": [],
      "PMID": "31622130",
      "DateCompleted": {
         "Year": "2020",
         "Month": "06",
         "Day": "22"
      },
      "DateRevised": {
         "Year": "2020",
         "Month": "07",
         "Day": "14"
      },
      "Article": {
         "Language": [
            "eng"
         ],
         "ELocationID": [
            "10.1200/JCO.19.00129"
         ],
         "ArticleDate": [
            {
               "Year": "2019",
               "Month": "10",
               "Day": "17"
            }
         ],
         "Journal": {
            "ISSN": "1527-7755",
            "JournalIssue": {
               "Volume": "37",
               "Issue": "34",
               "PubDate": {
                  "Year": "2019",
                  "Month": "12",
                  "Day"

In [6]:
# (abstracts, PMIDs, labels) for the cancer records
cancer_lists = json_extract(papers=cancer_papers, Key='Cancer')

Retrieving Alzheimer's Disease related papers

In [7]:
# Look up 'alzheimers' on PubMed (up to 100 IDs), download the records,
# then extract (abstracts, PMIDs, labels).
results = search(query='alzheimers', retrive='100')
id_list = results['IdList']
AD_papers = fetch_details(id_list=id_list)
AD_lists = json_extract(papers=AD_papers, Key='Alzheimers Disease')

Retrieving diabetes-related papers

In [8]:
# Look up 'diabetes' on PubMed (up to 150 IDs), download the records,
# then extract (abstracts, PMIDs, labels).
results = search(query='diabetes', retrive='150')
id_list = results['IdList']
diabetes_papers = fetch_details(id_list=id_list)
diabetes_lists = json_extract(papers=diabetes_papers, Key='Diabetes')

Retrieving Parkinson's Disease related papers

In [9]:
# Look up 'parkinsons' on PubMed (up to 50 IDs), download the records,
# then extract (abstracts, PMIDs, labels).
results = search(query='parkinsons', retrive='50')
id_list = results['IdList']
PD_papers = fetch_details(id_list=id_list)
PD_lists = json_extract(papers=PD_papers, Key='Parkinsons Disease')

Retrieving obesity-related papers

In [10]:
# Look up 'Obesity' on PubMed (up to 120 IDs), download the records,
# then extract (abstracts, PMIDs, labels).
results = search(query='Obesity', retrive='120')
id_list = results['IdList']
obesity_papers = fetch_details(id_list=id_list)
obesity_lists = json_extract(papers=obesity_papers, Key='Obesity')

Retrieving COVID-related papers

In [11]:
# Look up 'Covid' on PubMed (up to 120 IDs), download the records,
# then extract (abstracts, PMIDs, labels).
results = search(query='Covid', retrive='120')
id_list = results['IdList']
covid_papers = fetch_details(id_list=id_list)
covid_lists = json_extract(papers=covid_papers, Key='COVID')

In [12]:
# Start from three independent empty accumulators; the next cell replaces them
# with the combined per-topic lists.
Abstract, pmID, Keyword = [], [], []

In [13]:
# Each *_lists tuple is (abstracts, PMIDs, labels). Concatenate every field
# across the topics, keeping the topic order cancer -> AD -> diabetes -> PD ->
# obesity -> COVID.
Abstract, pmID, Keyword = (
    [
        entry
        for topic_lists in (
            cancer_lists,
            AD_lists,
            diabetes_lists,
            PD_lists,
            obesity_lists,
            covid_lists,
        )
        for entry in topic_lists[field]
    ]
    for field in range(3)
)

# 3 DATAFRAME

In [14]:
# Create an empty pandas DataFrame with the three output columns.
data_frame = pd.DataFrame(
    columns=[
        'PUBMED_ID',
        'Abstract',
        'Keyword',
    ]
)
data_frame.head()

Unnamed: 0,PUBMED_ID,Abstract,Keyword


In [15]:
# Fill the existing columns using explicit bracket indexing, then preview the
# first rows.
data_frame['PUBMED_ID'] = pmID
data_frame['Abstract'] = Abstract
data_frame['Keyword'] = Keyword
data_frame.head()

Unnamed: 0,PUBMED_ID,Abstract,Keyword
0,31622130,Therapeutic radiation in childhood cancer has ...,Cancer
1,32682319,The aim of the study was to characterize subse...,Cancer
2,32142393,To evaluate childhood cancer survivors' adhere...,Cancer
3,32469081,Birth defects are established risk factors for...,Cancer
4,31337641,Obesity is a known modifiable risk factor asso...,Cancer


In [16]:
# Number of records collected per topic label.
data_frame['Keyword'].value_counts()

Cancer                200
Diabetes              150
COVID                 120
Obesity               120
Alzheimers Disease    100
Parkinsons Disease     50
Name: Keyword, dtype: int64

In [21]:
# Write the collected records to CSV, leaving out the row index.
data_frame.to_csv(
    'PubMed_data.csv',
    index=False,
)