# Using the Entrez package to work with NCBI data

Small simple examples of downloading and working with NCBI data.
Requirements: biopython

In [18]:
import Bio.Entrez as etz
import pandas as pd

#### Best practice is to register your email before querying NCBI
Otherwise, every response will carry a warning that you have not registered.

In [13]:
# Register a contact address with Entrez before sending any request to NCBI.
etz.email = "oksana.korol@agr.gc.ca"

### List databases available at NCBI

In [14]:
# Fetch the unparsed XML listing of the databases exposed through Entrez.
handler = etz.einfo()
try:
    response = handler.read()
finally:
    # Close the handle even if reading fails, so it is never leaked.
    handler.close()
response

'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eInfoResult PUBLIC "-//NLM//DTD einfo 20130322//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20130322/einfo.dtd">\n<eInfoResult>\n<DbList>\n\n\t<DbName>pubmed</DbName>\n\t<DbName>protein</DbName>\n\t<DbName>nuccore</DbName>\n\t<DbName>ipg</DbName>\n\t<DbName>nucleotide</DbName>\n\t<DbName>nucgss</DbName>\n\t<DbName>nucest</DbName>\n\t<DbName>structure</DbName>\n\t<DbName>sparcle</DbName>\n\t<DbName>genome</DbName>\n\t<DbName>annotinfo</DbName>\n\t<DbName>assembly</DbName>\n\t<DbName>bioproject</DbName>\n\t<DbName>biosample</DbName>\n\t<DbName>blastdbinfo</DbName>\n\t<DbName>books</DbName>\n\t<DbName>cdd</DbName>\n\t<DbName>clinvar</DbName>\n\t<DbName>clone</DbName>\n\t<DbName>gap</DbName>\n\t<DbName>gapplus</DbName>\n\t<DbName>grasp</DbName>\n\t<DbName>dbvar</DbName>\n\t<DbName>gene</DbName>\n\t<DbName>gds</DbName>\n\t<DbName>geoprofiles</DbName>\n\t<DbName>homologene</DbName>\n\t<DbName>medgen</DbName>\n\t<DbName>mesh</DbName>\n\t

In [15]:
# Same request as the raw read, but parsed by Entrez into Python objects.
handler = etz.einfo()
try:
    parsed_response = etz.read(handler)
finally:
    # Close the handle even if parsing fails, so it is never leaked.
    handler.close()
parsed_response

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'nucgss', 'nucest', 'structure', 'sparcle', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'clone', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'pubmedhealth', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'unigene', 'gencoll', 'gtr']}

### Get information about a single database

In [25]:
# Ask einfo about one database; the result is stored under the 'DbInfo' key.
handler = etz.einfo(db="nucleotide")
try:
    parsed_response = etz.read(handler)
finally:
    # Close the handle even if parsing fails, so it is never leaked.
    handler.close()
parsed_response

{'DbInfo': {'DbName': 'nuccore', 'MenuName': 'Nucleotide', 'Description': 'Core Nucleotide db', 'DbBuild': 'Build180511-1800m.1', 'Count': '260055589', 'LastUpdate': '2018/05/12 21:29', 'FieldList': [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '4117657855', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to each sequence', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '426', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'WORD', 'FullName': 'Text Word', 'Description': 'Free text associated with record', 'TermCount': '1870944156', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'Is

In [27]:
# Inspect the top-level keys of the parsed response.
parsed_response.keys()

dict_keys(['DbInfo'])

In [28]:
# Inspect which entries are available under 'DbInfo'.
parsed_response['DbInfo'].keys()

dict_keys(['DbName', 'MenuName', 'Description', 'DbBuild', 'Count', 'LastUpdate', 'FieldList', 'LinkList'])

In [35]:
# Tabulate the field descriptions: one row per entry in 'FieldList'.
field_records = parsed_response['DbInfo']['FieldList']
pd.DataFrame(field_records)

Unnamed: 0,Description,FullName,Hierarchy,IsDate,IsHidden,IsNumerical,Name,SingleToken,TermCount
0,All terms from all searchable fields,All Fields,N,N,N,N,ALL,N,4117657855
1,Unique number assigned to each sequence,UID,N,N,Y,Y,UID,Y,0
2,Limits the records,Filter,N,N,N,N,FILT,Y,426
3,Free text associated with record,Text Word,N,N,N,N,WORD,N,1870944156
4,Words in definition line,Title,N,N,N,N,TITL,N,140787446
5,Nonstandardized terms provided by submitter,Keyword,N,N,N,N,KYWD,Y,15637719
6,Author(s) of publication,Author,N,N,N,N,AUTH,Y,2773000
7,Journal abbreviation of publication,Journal,N,N,N,N,JOUR,Y,34914
8,Volume number of publication,Volume,N,N,N,N,VOL,Y,3691
9,Issue number of publication,Issue,N,N,N,N,ISS,Y,4219


In [36]:
# Tabulate the link descriptions: one row per entry in 'LinkList'.
link_records = parsed_response['DbInfo']['LinkList']
pd.DataFrame(link_records)

Unnamed: 0,DbTo,Description,Menu,Name
0,ccds,Link to Consensus CDS,ccds,nucleotide_ccds
1,genome,Genome record containing nucleotide sequence,Assembly to Genome,nucleotide_genome


## Queries

#### Search for all fungal ITS sequences

In [38]:
# Search the nucleotide database for fungal (txid4751) ITS records.
# Single quotes avoid escaping the inner double quotes; the query text is unchanged.
query = 'txid4751[Organism:exp] AND "Internal Transcribed Spacer"[All Fields]'
handler = etz.esearch(db='nucleotide', term=query)
try:
    parsed_response = etz.read(handler)
finally:
    # Close the handle (as the einfo cells do), even if parsing fails.
    handler.close()
# In the recorded output 'Count' is the total number of hits, while 'IdList'
# holds only the first 'RetMax' (20) IDs.
parsed_response

{'Count': '1075149', 'RetMax': '20', 'RetStart': '0', 'IdList': ['1388874334', '1388874333', '1388874332', '1388874331', '1388874330', '1388874329', '1388874328', '1388874327', '1386813114', '1386812756', '1386811116', '1386808931', '1386808869', '1386808066', '1386807225', '1386806404', '1386806368', '1386806366', '1278989629', '1268013305'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'txid4751[Organism:exp]', 'Field': 'Organism', 'Count': '7241877', 'Explode': 'Y'}, {'Term': '"Internal Transcribed Spacer"[All Fields]', 'Field': 'All Fields', 'Count': '1647071', 'Explode': 'N'}, 'AND'], 'QueryTranslation': 'txid4751[Organism:exp] AND "Internal Transcribed Spacer"[All Fields]'}

In [39]:
# Inspect the top-level keys of the parsed search response.
parsed_response.keys()

dict_keys(['Count', 'RetMax', 'RetStart', 'IdList', 'TranslationSet', 'TranslationStack', 'QueryTranslation'])

In [49]:
# 'TranslationStack' mixes per-term dicts with bare operator tokens such as 'AND'.
val = parsed_response['TranslationStack']
# Remove the operator tokens in place so only the per-term dicts remain.
# Unlike `del val[-1]`, this is idempotent: re-running the cell never drops a
# real term. The list is still modified in place, as before.
val[:] = [entry for entry in val if isinstance(entry, dict)]
pd.DataFrame(val)

Unnamed: 0,Count,Explode,Field,Term
0,7241877,Y,Organism,txid4751[Organism:exp]
1,1647071,N,All Fields,"""Internal Transcribed Spacer""[All Fields]"


In [50]:
# Show the query string reported back under 'QueryTranslation'.
parsed_response['QueryTranslation']

'txid4751[Organism:exp] AND "Internal Transcribed Spacer"[All Fields]'

In [51]:
# Tabulate the per-term entries of 'TranslationStack'. Bare operator tokens
# (e.g. 'AND') are skipped here, so this cell does not depend on an earlier
# cell having already removed them from the list.
term_entries = [
    entry
    for entry in parsed_response['TranslationStack']
    if isinstance(entry, dict)
]
pd.DataFrame(term_entries)

Unnamed: 0,Count,Explode,Field,Term
0,7241877,Y,Organism,txid4751[Organism:exp]
1,1647071,N,All Fields,"""Internal Transcribed Spacer""[All Fields]"
