# NCBI

Use the "Run" button to execute the code.

In [2]:
!pip install jovian --upgrade --quiet

In [3]:
import jovian

In [4]:
# Execute this to save new versions of the notebook
jovian.commit(project="NCBI Consolidated", filename="ncbi-consolidated")

[jovian] Detected Colab notebook...[0m
[jovian] Please enter your API key ( from https://jovian.ai/ ):[0m
API KEY: ··········
[jovian] Uploading colab notebook to Jovian...[0m
Committed successfully! https://jovian.ai/anubratadas/ncbi-consolidated


'https://jovian.ai/anubratadas/ncbi-consolidated'

In [5]:
import os
import time
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

%matplotlib inline

In [None]:
# record counts from the PMC and PubMed databases
url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" # base url
def get_values():
    """Count the ESearch hits of each search term in the PMC and PubMed databases.

    Reads the module-level ``url`` (the ESearch endpoint) and, when it is set,
    the ``NCBI_API_KEY`` environment variable.

    Returns:
        dict: ``{"term": [...], "pmc": [...], "pubmed": [...]}``. Each database
        list is aligned with ``term`` and holds the text of the response's
        ``<Count>`` element, or ``None`` when that request or its parsing failed.
    """
    db_list=["pmc","pubmed"]
    term=["mutation","mutation AND cancer","mutation AND phylogeny","mutation AND 'DNA repair'","mutation AND 'driver gene'",
         "mutation AND 'drug resistance'","'adaptive mutation' AND cancer","'adaptive mutation' AND phylogeny",
         "'adaptive mutation' AND 'drug resistance'","cancer AND pathways AND mutations","cancer AND 'pathway analysis'",
         "cancer AND 'drug resistance'","'driver gene' AND 'drug resistance'"]
    api_key=os.environ.get("NCBI_API_KEY") # keep the credential out of the notebook
    count_dict={}
    count_dict['term']=term # populate the terms in the dictionary
    for db in db_list:
        response_lst=[]
        for item in term:
            time.sleep(1) # pause between consecutive requests
            # "rettype" (not "retype") is the ESearch parameter that asks for the count only
            payload={"db":db,"term":item,"rettype":"count","retmax":1000,"email":"anubratadas@gmail.com"}
            if api_key:
                payload["api_key"]=api_key
            try:
                response=requests.get(url,params=payload) # response object
                response.raise_for_status()
                tree=ET.fromstring(response.content) # parse the XML body
                count=tree.find('Count').text # AttributeError if <Count> is absent
            except (requests.exceptions.RequestException,ET.ParseError,AttributeError) as error:
                print(error)
                count=None # placeholder keeps this list aligned with ``term``
            else:
                print(count)
            response_lst.append(count)
        count_dict[db]=response_lst
    return count_dict # counts per term and database
                   
count_dict=get_values()

1014209
472077
58006
0
1554
844
515
68
17
366428
2901
979
20
953072
250638
19403
23
64
34
35
0
0


In [None]:
# Build the counts table, then turn both database columns from text into numbers.
df_mutation=pd.DataFrame(count_dict)
df_mutation[['pmc','pubmed']]=df_mutation[['pmc','pubmed']].apply(pd.to_numeric)
df_mutation.dtypes

In [None]:
df_mutation

In [None]:
# Unpivot the two database columns into long format: one row per (term, database)
# pair, with the column names set directly by melt instead of a later rename.
df_mutation_melted=pd.melt(
    df_mutation,
    id_vars=['term'],
    value_vars=['pmc','pubmed'],
    var_name="database",
    value_name="records",
)

In [None]:
# Put the record counts on a natural-log scale for plotting.
# log1p, i.e. log(1 + x), is used because some searches return 0 records and log(0) is -inf.
# The counts are re-read from df_mutation, so re-running this cell does not apply the log twice.
raw_records=df_mutation.melt(id_vars=['term'],value_vars=['pmc','pubmed'])['value']
df_mutation_melted['records']=np.log1p(raw_records.to_numpy())
df_mutation_melted.head()

In [None]:
# Grouped bar chart: one pair of bars (PMC, PubMed) per search term.
fig,ax=plt.subplots(figsize=(12, 6))
sns.barplot(data=df_mutation_melted,x='term',y='records',hue='database',ax=ax)
ax.set_title('Records per search term in PMC and PubMed')
ax.set_xlabel('search term')
ax.set_ylabel('records (natural-log scale)') # the values were log-transformed before plotting
plt.xticks(rotation=70)
fig.tight_layout();

In [None]:
# from PMC Database
url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" # base url
def get_efetch_uids(item):
    """Run an ESearch query against PMC and return the matching UID elements.

    Reads the module-level ``url`` (the ESearch endpoint) and, when it is set,
    the ``NCBI_API_KEY`` environment variable.

    Args:
        item: search expression sent as the ESearch ``term`` parameter.

    Returns:
        The ``<IdList>`` element of the response (iterating it yields the
        ``<Id>`` children), or an empty list when the response has no
        ``<IdList>`` or the request ran into too many redirects.

    Raises:
        SystemExit: when the server answers with an HTTP error status.
    """
    # The misspelled "retype":"count" entry was dropped: spelled correctly it
    # would ask for the count only, and the <IdList> needed here would be missing.
    payload={"db":"pmc","term":item,"retmax":1000,"email":"anubratadas@gmail.com"} # parameters
    api_key=os.environ.get("NCBI_API_KEY") # keep the credential out of the notebook
    if api_key:
        payload["api_key"]=api_key
    try:
        response=requests.get(url,params=payload) # response object
        response.raise_for_status()
        root=ET.fromstring(response.content) # parse the XML body
        id_list=root.find('IdList')
        if id_list is None: # compare with None: an element without children is falsy
            return []
        return id_list
    except requests.exceptions.HTTPError as err:
        raise SystemExit(err)
    except requests.exceptions.TooManyRedirects:
        print("please try a different URL")
        return [] # callers iterate the result, so never hand back None
            
def call_efetch():
    """Collect the PMC UIDs matching a fixed list of search terms.

    Calls ``get_efetch_uids`` once per term, one second apart, and keeps every
    UID once in first-seen order, because the terms overlap and the same
    article can match several of them. Prints how many UIDs were kept.

    Returns:
        str: the collected UIDs joined with commas.
    """
    term=["mutation AND 'driver gene'","cancer AND 'drug resistance'","mutation AND 'drug resistance'","'driver gene' AND 'drug resistance'",
         "'driver gene' AND cancer","'adaptive mutation' AND 'drug resistance'","cancer AND pathways AND mutations","cancer AND 'pathway analysis'",
         "cancer AND 'drug resistance' AND 'driver gene'","'DNA repair' AND 'drug resistance'"]
    seen={} # dict keys keep insertion order and drop repeats
    for item in term:
        time.sleep(1) # pause between consecutive requests
        id_list=get_efetch_uids(item)
        if id_list is None: # nothing usable came back for this term
            continue
        for id_num in id_list:
            seen[id_num.text]=None
    uid_list=list(seen)
    uid=",".join(uid_list)
    print(f' {len(uid_list)} records retrieved')
    return uid
uid=call_efetch()

In [None]:
# ESummary for every UID collected by ESearch, passed through the "id" parameter.
# With more than about 200 UIDs the request has to be an HTTP POST whose fields
# travel in the form body (data=); a GET would put every UID into the URL.
url_summ="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
payload={"db":"pmc","id":uid,"email":"anubratadas@gmail.com"}
api_key=os.environ.get("NCBI_API_KEY") # keep the credential out of the notebook
if api_key:
    payload["api_key"]=api_key
response_esumm_uid=requests.post(url_summ,data=payload) # response object
response_esumm_uid.raise_for_status() # stop here on an HTTP error instead of parsing an error page
esummary_root_et=ET.fromstring(response_esumm_uid.content)

In [None]:
# Walk the ESummary response two levels deep and print the text of every field.
for child in ET.fromstring(response_esumm_uid.content):
    for subchild in child.iterfind("*"):
        print(subchild.text)

Because the list of UIDs was so large, the server refused the connection when they were all sent in a single GET request.

Now we will split the list of UIDs into chunks of 200.

In [None]:
# this function creates sublists of 200 entries each for getting esummary records

def chunk_list(uid_string=None,chunk_size=200):
    """Split a comma-separated UID string into sublists of at most ``chunk_size`` UIDs.

    Prints the total number of UIDs before splitting.

    Args:
        uid_string: comma-separated UIDs. When omitted, the module-level
            ``uid`` string is used, which is what the original version did.
        chunk_size: maximum number of UIDs per sublist (default 200).

    Returns:
        list[list[str]]: consecutive slices of the UID list, order preserved.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size<=0:
        raise ValueError("chunk_size must be a positive integer")
    if uid_string is None:
        uid_string=uid # fall back to the string built earlier in the notebook
    new_uid_list=uid_string.split(",") # back from one string to a list of UIDs
    print(len(new_uid_list))
    # slicing past the end is safe, so the last chunk is simply shorter
    return [new_uid_list[start:start+chunk_size] for start in range(0,len(new_uid_list),chunk_size)]
master_list=chunk_list()

In [None]:
master_list[0][1:10]

In [None]:
# this code snippet will change the UIDs in the sublist back into strings for searching with esummary as a parameter
def uid_string_convert(master_list):
    """Join every UID sublist into one comma-separated string.

    Prints how many strings were built and returns them in the order of
    ``master_list``.
    """
    joined_chunks=[",".join(chunk) for chunk in master_list]
    print(len(joined_chunks))
    return joined_chunks


def get_summary():
    """Fetch the ESummary records for every UID chunk in ``master_list``.

    The module-level ``master_list`` is converted to comma-separated strings
    with ``uid_string_convert`` and one request is sent per chunk, one second
    apart. The results of all chunks are collected; the earlier version
    returned from inside the loop and therefore only fetched the first chunk.

    Returns:
        list: the ``<DocSum>`` elements of all chunks, in request order
        (empty when there are no chunks).
    """
    url_summ="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
    api_key=os.environ.get("NCBI_API_KEY") # keep the credential out of the notebook
    docsum=[]
    for uid_string in uid_string_convert(master_list):
        payload={"db":"pmc","id":uid_string,"email":"anubratadas@gmail.com"}
        if api_key:
            payload["api_key"]=api_key
        time.sleep(1)                                              # server throttle
        response_esumm_uid=requests.get(url_summ,params=payload)   # response object
        response_esumm_uid.raise_for_status()
        esummary_root_et=ET.fromstring(response_esumm_uid.content) # get content
        docsum.extend(esummary_root_et.findall('DocSum'))          # keep this chunk's documents
    return docsum

def parse_docsum():
    """Extract the UID and title of every record returned by ``get_summary``.

    Returns:
        dict: two parallel lists, ``"uid"`` (text of each record's ``<Id>``)
        and ``"title"`` (text of each record's ``<Item Name="Title">``).
    """
    # one pass over the records, reading the id first and the title second
    pairs=[
        (record.find('Id').text,record.find('Item[@Name="Title"]').text)
        for record in get_summary()
    ]
    return {
        "uid":[record_id for record_id,_ in pairs],
        "title":[record_title for _,record_title in pairs],
    }

# Collect the titles, tabulate them and save the table next to the notebook.
title_dict=parse_docsum()
search_title=pd.DataFrame(data=title_dict)
# to_csv returns None when it is given a path, so search_title_csv is None
search_title_csv=search_title.to_csv(path_or_buf="ncbi_search_titles_pmc.csv")

In [None]:
search_title

In [None]:
# ESummary for a single PubMed UID (8021012), sent as an HTTP POST.
# The fields go in the form body (data=); with params= they would be appended
# to the URL even though the request method is POST.
url_summ="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
payload={"db":"pubmed","id":"8021012","email":"anubratadas@gmail.com"}
api_key=os.environ.get("NCBI_API_KEY") # keep the credential out of the notebook
if api_key:
    payload["api_key"]=api_key
response_esumm_uid=requests.post(url_summ,data=payload) # response object
response_esumm_uid.raise_for_status() # stop here on an HTTP error instead of parsing an error page
esummary_root_et=ET.fromstring(response_esumm_uid.content)

In [None]:
esummary_root_et.find('DocSum/Item[@Name="Title"]').text

In [None]:
esummary_root_et.find('DocSum/Id').text

In [None]:
jovian.commit(project="NCBI Consolidated", filename="ncbi-consolidated")

#### now we shall try to get the records

In [None]:
fetch_url=",".join(master_list[0])

In [None]:
fetch_url

In [None]:
response_efetch=requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",params={"db":"pmc","id":'6650054',"format":"xml"})

In [None]:
# raise_for_status() raises on a 4xx/5xx answer and returns None otherwise,
# so there is nothing worth printing.
response_efetch.raise_for_status()

In [None]:
!conda install lxml --yes --quiet

In [None]:
from lxml import etree
from io import StringIO, BytesIO

In [None]:
# Parse the raw bytes so the parser applies the encoding declared by the XML
# document itself, not the one requests guessed when decoding .text.
root=etree.parse(BytesIO(response_efetch.content))

In [None]:
# view the layout of the tree object
# encoding="unicode" returns text, not bytes, so print shows real line breaks
# and not a single-line b'...' literal
print(etree.tostring(root,pretty_print=True,encoding="unicode"))

In [None]:
# using element tree: text of the first <article-title> met in document order
title_from_element_tree=list(root.iter('article-title'))[0].text

In [None]:
title=root.xpath('//article-title/text()')[0]
title

In [None]:
# using element tree: walk the whole tree and remember the last <abstract> element
for element in root.iter():
    if element.tag!='abstract':
        continue
    abstract_tag=element

In [None]:
# Print every paragraph of the abstract. itertext() also gathers the text
# inside nested inline elements (such as the formatted gene name POGZ) and the
# text after them, whereas .text stops at the first child element.
for sub_element in abstract_tag:
    if sub_element.tag=='p':
        print("".join(sub_element.itertext()))

In [None]:
# using xpath
abstract_list=root.xpath('//abstract//p//text()') # the formatting for POGZ disrupts the text formatting
abstract="".join(abstract_list)                   # the list obtained is combined as text  
abstract

In [None]:
# NOTE(review): `conclusions` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. The cells below iterate it and read each
# item's <title>, so it is presumably the list of <sec> elements of the article body.
# TODO confirm that this XPath matches the original definition.
conclusions=root.xpath('//body/sec')
conclusions

In [None]:
for section in conclusions:
    for element in section.iter():
        if element.tag=="Discussion":
            print(element.text)
       

In [None]:
for section in conclusions:
    for element in section.iter("title"):       
            print(element.text)       

In [None]:
# Concatenate the paragraphs of the section titled "Discussion".
# itertext() includes the text nested in inline markup and never yields None,
# so a paragraph that starts with a child element no longer breaks the join.
discussion=""
for section in conclusions:
    if section.findtext('title')=="Discussion":
        for element in section: # direct children; getchildren() is deprecated
            if element.tag=='p':
                discussion=discussion+"".join(element.itertext())

In [None]:
discussion