# Overview

# Table of contents

* [Imports](#imports)

# Imports<a id="imports"></a>

In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import pandas as pd
from pathlib import Path
import sys

**Import the PubMed module.**

In [29]:
cwd = Path.cwd()

In [30]:
module_dir = cwd.parent / 'scripts'

In [31]:
sys.path.append(str(module_dir))

In [32]:
import pubmed

# 1. Get literature from PubMed

**Scope**

We'll focus on getting 50 articles each for breast and lung cancers, from 2023-present, and remove articles of the type Review or Systematic Review.

In [33]:
bc_query = '''((("2023/01/01"[Date - Publication] : "3000"[Date - Publication])) 
NOT (Review[Publication Type] OR Scientific Integrity Review OR Systematic Review[Publication Type])) 
AND (breast cancer[Title/Abstract])'''

lc_query = '''((("2023/01/01"[Date - Publication] : "3000"[Date - Publication])) 
NOT (Review[Publication Type] OR Scientific Integrity Review OR Systematic Review[Publication Type])) 
AND (lung cancer[Title/Abstract])'''


**Get matching PubMed IDs (PMIDs) for breast and lung cancer.**

In [60]:
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

database='pubmed'
# query='(breast+cancer[tiab]+OR+breast+carcinoma[tiab])'
# query = '2023/01/01:2024/01/01[pdat]'
query = '(breast+cancer[tiab]+OR+breast+carcinoma[tiab])+AND+(2023/01/01:3000[pdat])'
save_on_server='y'
search_format='json'
starting_index=0
max_records=3

search_suffix = (f'esearch.fcgi?db={database}'
                     f'&term={query}'
                     f'&usehistory={save_on_server}'
                     f'&retmode={search_format}'
                     f'&retstart={starting_index}'
                     f'&retmax={max_records}')

# Construct url
search_url = base_url + search_suffix

In [61]:
search_url

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=(breast+cancer[tiab]+OR+breast+carcinoma[tiab])+AND+(2023/01/01:3000[pdat])&usehistory=y&retmode=json&retstart=0&retmax=3'

In [43]:


save_on_server = 'y'
search_format = 'json'
starting_index = 0
max_records = 3

bc_output = pubmed.get_pmids(query=bc_query,
                             save_on_server=save_on_server,
                             search_format=search_format,
                             starting_index=starting_index,
                             max_records=max_records)


Query string:((("2023/01/01"[Date - Publication] : "3000"[Date - Publication])) 
NOT (Review[Publication Type] OR Scientific Integrity Review OR Systematic Review[Publication Type])) 
AND (breast cancer[Title/Abstract])
The actual total number of records matching the search for is 23691
The number of ids retrieved using the esearch utility is 3
Function get_pmids complete.


In [44]:
bc_output

{'header': {'type': 'esearch', 'version': '0.3'},
 'esearchresult': {'count': '23691',
  'retmax': '3',
  'retstart': '0',
  'querykey': '1',
  'webenv': 'MCID_65e7631010c8237d0e20e7f2',
  'idlist': ['38441959', '38441847', '38441769'],
  'translationset': [{'from': 'Scientific Integrity Review',
    'to': '"scientific integrity review"[Publication Type] OR "scientific integrity review"[All Fields]'}],
  'querytranslation': '(2023/01/01:3000/12/31[Date - Publication] NOT ("Review"[Publication Type] OR ("scientific integrity review"[Publication Type] OR "scientific integrity review"[All Fields]) OR "systematic review"[Publication Type])) AND "breast cancer"[Title/Abstract]'}}

In [45]:
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
database = 'pubmed'
query_key = bc_output['esearchresult']['querykey']
web_env = bc_output['esearchresult']['webenv']
content_type = 'abstract'

fetchurl_suffix = f'efetch.fcgi?db={database}&query_key={query_key}&WebEnv={web_env}&rettype={content_type}&retmode=xml'


In [46]:
import requests

In [48]:
fetch_url = base_url + fetchurl_suffix


In [49]:
fetch_url

'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_65e7631010c8237d0e20e7f2&rettype=abstract&retmode=xml'

In [35]:
lc_output = pubmed.get_pmids(query=lc_query,
                             save_on_server=save_on_server,
                             search_format=search_format,
                             starting_index=starting_index,
                             max_records=max_records)


Query string:((("2023/01/01"[Date - Publication] : "3000"[Date - Publication])) 
NOT (Review[Publication Type] OR Scientific Integrity Review OR Systematic Review[Publication Type])) 
AND (lung cancer[Title/Abstract])
The actual total number of records matching the search for is 16238
The number of ids retrieved using the esearch utility is 5
Function get_pmids complete.


**Collect metadata about the search results.**

In [36]:
metadata_df1 = pubmed.query_and_ids(bc_output)

Metadata obtained and saved in a df.


In [37]:
metadata_df1.head()

Unnamed: 0,query_string,pmid,num_total_matches,num_retrieved_papers
0,(2023/01/01:3000/12/31[Date - Publication] NOT...,38441959,23691,5
1,(2023/01/01:3000/12/31[Date - Publication] NOT...,38441847,23691,5
2,(2023/01/01:3000/12/31[Date - Publication] NOT...,38441769,23691,5
3,(2023/01/01:3000/12/31[Date - Publication] NOT...,38441716,23691,5
4,(2023/01/01:3000/12/31[Date - Publication] NOT...,38441572,23691,5


In [38]:
metadata_df2 = pubmed.query_and_ids(lc_output)

Metadata obtained and saved in a df.


**Get title, abstracts, and some other details for all PMIDs in XML format.**

In [39]:
content_type='abstract'
ids_from_server=True                 
                  
bc_xml = pubmed.get_abstracts(search_output=bc_output,
                              content_type=content_type,
                              ids_from_server=ids_from_server)

Function get_abstracts complete.


In [19]:
content_type='abstract'
ids_from_server=True                 
                  
lc_xml = pubmed.get_abstracts(search_output=lc_output,
                              content_type=content_type,
                              ids_from_server=ids_from_server)

Function get_abstracts complete.


Confirm content in the xml strings.

In [41]:
bc_output

{'header': {'type': 'esearch', 'version': '0.3'},
 'esearchresult': {'count': '23691',
  'retmax': '5',
  'retstart': '0',
  'querykey': '1',
  'webenv': 'MCID_65e76236b21fee512418163d',
  'idlist': ['38441959', '38441847', '38441769', '38441716', '38441572'],
  'translationset': [{'from': 'Scientific Integrity Review',
    'to': '"scientific integrity review"[Publication Type] OR "scientific integrity review"[All Fields]'}],
  'querytranslation': '(2023/01/01:3000/12/31[Date - Publication] NOT ("Review"[Publication Type] OR ("scientific integrity review"[Publication Type] OR "scientific integrity review"[All Fields]) OR "systematic review"[Publication Type])) AND "breast cancer"[Title/Abstract]'}}

In [40]:
len(bc_xml)

193483738

In [21]:
len(lc_xml)

184297189

**Extract data from the XML into a df.**

In [22]:
bc_df = pubmed.get_data_from_xml(bc_xml)

Performing basic cleanup.
&#xa0 left: 0
&#x3ba left: 0
&# left: 44
Iterating through each article and collecting information.


TypeError: can only concatenate str (not "NoneType") to str

TypeError: unsupported operand type(s) for +: 'NoneType' and 'str'