# Overview

# Table of contents

* [Imports](#imports)

# Imports<a id="imports"></a>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
import sys

**Import the PubMed module.**

In [3]:
cwd = Path.cwd()

In [4]:
module_dir = cwd.parent / 'scripts'

In [5]:
sys.path.append(str(module_dir))

In [6]:
import pubmed

# Paths

In [13]:
output_dir = cwd.parent / 'outputs'

**Path to CSV containing literature from 87 papers from PubMed.**

In [16]:
pubmed_output = output_dir / 'pubmed_data.csv'

# 1. Get literature from PubMed

**Scope**

We'll focus on getting ~30 articles each for breast cancer,lung cancer, and glioblastoma from 2023-present, and remove articles of the type Review or Systematic Review.

**Establish base criteria for querying.**

In [7]:
date_criteria = '(2023/01/01:3000[pdat])'
drug_criteria = '(drug[tiab]+OR+inhibitor[tiab]+OR+compound[tiab]+OR+small+molecule[tiab]+OR+clinical+trial[tiab]+OR+therapy[tiab]+OR+agent[tiab])'
pub_criteria = '(Review[pt]+OR+Scientific+Integrity+Review[pt]+OR+Systematic+Review[pt])'


**Run the PubMed pipeline for getting papers for 3 cancer types.**

In [8]:
# Collect all dfs to merge later
all_dfs = []

for cancer_type in ['breast+cancer','lung+cancer','glioblastoma']:
    disease_criteria = f'({cancer_type}[tiab])'
    
    query = date_criteria + '+AND+' + disease_criteria + '+AND+' + drug_criteria + '+NOT+' + pub_criteria
    
    df = pubmed.run_pubmed_pipeline(query=query,
                                    save_on_server='y',
                                    search_format='json',
                                    search_starting_index=0,
                                    search_max_records=9999,
                                    sorting_criteria='relevance',
                                    content_type='abstract',
                                    fetch_starting_index=0,
                                    fetch_max_records=30)
    
    # Add an identifier column for cancer type for easy searching
    df['disease'] = cancer_type
    print(f'Num rows in df: {len(df)}')
    all_dfs.append(df)
    
    print(f'Pipeline complete for {cancer_type}')

# Combine all dfs
final_df = pd.concat(all_dfs)

----Running pipeline for the following query:----
(2023/01/01:3000[pdat])+AND+(breast+cancer[tiab])+AND+(drug[tiab]+OR+inhibitor[tiab]+OR+compound[tiab]+OR+small+molecule[tiab]+OR+clinical+trial[tiab]+OR+therapy[tiab]+OR+agent[tiab])+NOT+(Review[pt]+OR+Scientific+Integrity+Review[pt]+OR+Systematic+Review[pt])
Using PubMed esearch API to get PMIDs matching the search query.
	The actual total number of records matching the search for is 9403
	The number of ids present in the esearch json is 9403
	Function get_pmids complete.
Collecting metadata about the search results into a dictionary.
	Metadata obtained and saved in a dictionary.
Using PubMed efetch API to get abstract and other details for relevant PMIDs into an XML string.
	The number of matching PMIDs based on the server: 30
	Function get_abstracts complete.
Extracting data from XML string and organizing it into a dataframe.
	Performing basic cleanup.
	&#xa0 left: 0
	&#x3ba left: 0
	&# left: 0
Iterating through each article and col

In [9]:
len(final_df)

87

**Final dataframe ready with content extracted from PubMed.**

In [15]:
final_df

Unnamed: 0,pmid,publication_date,publication_type,article_title,abstract,keywords,journal,num_abstracts_retrieved,num_abstracts_requested,query_string,num_total_matches,all_matching_pmids,acquisition_date,disease
0,37256976,2023 Jun 01,"Clinical Trial, Phase III|Journal Article|Rand...",Capivasertib in Hormone Receptor-Positive Adva...,[BACKGROUND]AKT pathway activation is implicat...,,The New England journal of medicine,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,9403,"37256976,37557181,37070653,37147285,37723305,3...",2024-03-06,breast+cancer
1,37557181,2023 Oct 19,"Journal Article|Research Support, Non-U.S. Gov't","Discovery of a highly potent, selective, orall...","KAT6A, and its paralog KAT6B, are histone lysi...",CTx-648|KAT6A|KAT6B|PF-9363|breast cancer|cell...,Cell chemical biology,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,9403,"37256976,37557181,37070653,37147285,37723305,3...",2024-03-06,breast+cancer
2,37070653,2023 Mar,Clinical Trial Protocol|Journal Article,"Design of SERENA-6, a phase III switching tria...",ESR1 mutation (ESR1m) is a frequent cause of a...,ESR1 mutation|advanced breast cancer|camizestr...,"Future oncology (London, England)",30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,9403,"37256976,37557181,37070653,37147285,37723305,3...",2024-03-06,breast+cancer
3,37147285,2023 May 05,"Journal Article|Research Support, Non-U.S. Gov't",KK-LC-1 as a therapeutic target to eliminate ALDH,Failure to achieve complete elimination of tri...,,Nature communications,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,9403,"37256976,37557181,37070653,37147285,37723305,3...",2024-03-06,breast+cancer
4,37723305,2023 Oct,Journal Article,Acetate acts as a metabolic immunomodulator by...,Acetate metabolism is an important metabolic p...,,Nature cancerMain References:Methods Only Refe...,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,9403,"37256976,37557181,37070653,37147285,37723305,3...",2024-03-06,breast+cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,37886538,2023 Oct 05,Preprint,LDHA-regulated tumor-macrophage symbiosis prom...,Abundant macrophage infiltration and altered t...,,Research square,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,1817,"36791206,37451272,36749723,37935665,38215747,3...",2024-03-06,glioblastoma
26,37417769,2023 Oct,Journal Article,Engineering and Characterization of an Artific...,Glioblastoma multiforme (GBM) treatment is hin...,blood-brain barrier|engineered artificial vesi...,"Advanced materials (Deerfield Beach, Fla.)",30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,1817,"36791206,37451272,36749723,37935665,38215747,3...",2024-03-06,glioblastoma
27,37147437,2023 Jun,Journal Article,EZH2-Myc driven glioblastoma elicited by cytom...,Mounting evidence is identifying human cytomeg...,,Oncogene,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,1817,"36791206,37451272,36749723,37935665,38215747,3...",2024-03-06,glioblastoma
28,37572644,2023 Sep,Journal Article,Combination drug screen targeting glioblastoma...,[BACKGROUND]Pharmacological synergisms are an ...,Cancer vulnerabilities|Drug combination screen...,EBioMedicine,30,30,(2023/01/01:3000/12/31[Date - Publication] AND...,1817,"36791206,37451272,36749723,37935665,38215747,3...",2024-03-06,glioblastoma


**Save the df as a CSV.**

In [17]:
final_df.to_csv(pubmed_output,index=False)