### Setup.

In [1]:
import os

from Bio import Entrez

# Identify this client to NCBI: a contact e-mail address and a tool name are
# sent along with every E-utilities request made through Bio.Entrez.
Entrez.email = "sosa@uma.es"
Entrez.tool = "Biopython presentation notebook"
# Read the optional NCBI API key from the environment instead of keeping it
# (even partially redacted) in the notebook. When NCBI_API_KEY is unset or
# empty this stays None, i.e. requests are sent without an API key.
Entrez.api_key = os.environ.get("NCBI_API_KEY") or None

### Einfo: Obtaining information about the Entrez databases

Printing the XML response.

In [2]:
# Fetch the raw EInfo response. The with-block closes the connection even if
# reading fails, matching the style of the parsing cells below.
with Entrez.einfo() as handle:
    result = handle.read()
# Depending on the Biopython version the payload may arrive as bytes; decode
# it so the XML prints as readable text rather than as a bytes repr.
if isinstance(result, bytes):
    result = result.decode("utf-8")
print(result)

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE eInfoResult PUBLIC "-//NLM//DTD einfo 20130322//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20130322/einfo.dtd">
<eInfoResult>
<DbList>

	<DbName>pubmed</DbName>
	<DbName>protein</DbName>
	<DbName>nuccore</DbName>
	<DbName>ipg</DbName>
	<DbName>nucleotide</DbName>
	<DbName>nucgss</DbName>
	<DbName>nucest</DbName>
	<DbName>structure</DbName>
	<DbName>sparcle</DbName>
	<DbName>genome</DbName>
	<DbName>annotinfo</DbName>
	<DbName>assembly</DbName>
	<DbName>bioproject</DbName>
	<DbName>biosample</DbName>
	<DbName>blastdbinfo</DbName>
	<DbName>books</DbName>
	<DbName>cdd</DbName>
	<DbName>clinvar</DbName>
	<DbName>clone</DbName>
	<DbName>gap</DbName>
	<DbName>gapplus</DbName>
	<DbName>grasp</DbName>
	<DbName>dbvar</DbName>
	<DbName>gene</DbName>
	<DbName>gds</DbName>
	<DbName>geoprofiles</DbName>
	<DbName>homologene</DbName>
	<DbName>medgen</DbName>
	<DbName>mesh</DbName>
	<DbName>ncbisearch</DbName>
	<DbName>nlmcatalog</DbName>
	<DbName

Parsing the XML into a Python dictionary.

In [3]:
# Parse the EInfo XML while the connection is open, then print the parsed
# structure once the handle has been released.
with Entrez.einfo() as handle:
    db_listing = Entrez.read(handle)
print(db_listing)

{'DbList': ['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'nucgss', 'nucest', 'structure', 'sparcle', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'clone', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'probe', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'pubmedhealth', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'unigene', 'gencoll', 'gtr']}


We can retrieve more detailed information about each database.

In [4]:
# Ask EInfo about a single database (PubMed) and print the parsed record
# after the handle has been closed.
with Entrez.einfo(db="pubmed") as handle:
    pubmed_info = Entrez.read(handle)
print(pubmed_info)

{'DbInfo': {'DbName': 'pubmed', 'MenuName': 'PubMed', 'Description': 'PubMed bibliographic record', 'DbBuild': 'Build180604-2212m.3', 'Count': '28512410', 'LastUpdate': '2018/06/05 15:38', 'FieldList': [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '201334348', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to publication', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '11708', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'TITL', 'FullName': 'Title', 'Description': 'Words in title of publication', 'TermCount': '17003536', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden'

### ESearch: Searching the Entrez databases

In [5]:
# Free-text query sent to the nucleotide database.
search_term = "Pinus pinaster GS1b glutamine synthetase"
with Entrez.esearch(db="nucleotide", term=search_term) as handle:
    # `ids` holds the whole parsed ESearch record; the matching UIDs are
    # under ids["IdList"] and are reused by later cells.
    ids = Entrez.read(handle)
ids

{'Count': '3', 'RetMax': '3', 'RetStart': '0', 'IdList': ['1153959381', '440586435', '426263228'], 'TranslationSet': [{'From': 'Pinus pinaster', 'To': '"Pinus pinaster"[Organism] OR Pinus pinaster[All Fields]'}], 'TranslationStack': [{'Term': '"Pinus pinaster"[Organism]', 'Field': 'Organism', 'Count': '12293', 'Explode': 'Y'}, {'Term': 'Pinus pinaster[All Fields]', 'Field': 'All Fields', 'Count': '13963', 'Explode': 'N'}, 'OR', 'GROUP', {'Term': 'GS1b[All Fields]', 'Field': 'All Fields', 'Count': '12', 'Explode': 'N'}, 'AND', {'Term': 'glutamine synthetase[All Fields]', 'Field': 'All Fields', 'Count': '432024', 'Explode': 'N'}, 'AND', 'GROUP'], 'QueryTranslation': '("Pinus pinaster"[Organism] OR Pinus pinaster[All Fields]) AND GS1b[All Fields] AND glutamine synthetase[All Fields]'}

###  ESummary: Retrieving summaries from primary IDs

Retrieve the summary of every result of the 'Pinus pinaster GS1b glutamine synthetase' search.

In [6]:
# Request the document summaries for all UIDs found by the search above.
# The with-block makes sure the connection is closed once the response has
# been parsed (the nested-call form never closed it).
with Entrez.esummary(db="nucleotide", id=",".join(ids["IdList"])) as handle:
    summaries = Entrez.read(handle)
summaries

[{'Item': [], 'Id': '1153959381', 'Caption': 'KU641798', 'Title': 'Pinus pinaster cytosolic glutamine synthetase GS1b (GS1b) mRNA, complete cds', 'Extra': 'gi|1153959381|gb|KU641798.1|[1153959381]', 'Gi': 1153959381, 'CreateDate': '2018/02/28', 'UpdateDate': '2018/02/28', 'Flags': 0, 'TaxId': 71647, 'Length': 1435, 'Status': 'live', 'ReplacedBy': '', 'Comment': '  ', 'AccessionVersion': 'KU641798.1'}, {'Item': [], 'Id': '440586435', 'Caption': 'HE866753', 'Title': 'Pinus pinaster GS1b gene promoter region', 'Extra': 'gi|440586435|emb|HE866753.1|[440586435]', 'Gi': 440586435, 'CreateDate': '2013/01/07', 'UpdateDate': '2013/01/07', 'Flags': 0, 'TaxId': 71647, 'Length': 1209, 'Status': 'live', 'ReplacedBy': '', 'Comment': '  ', 'AccessionVersion': 'HE866753.1'}, {'Item': [], 'Id': '426263228', 'Caption': 'HF548531', 'Title': 'Pinus pinaster GS1b gene for glutamine synthetase', 'Extra': 'gi|426263228|emb|HF548531.1|[426263228]', 'Gi': 426263228, 'CreateDate': '2012/12/02', 'UpdateDate': '2

###  EFetch: Downloading full records from Entrez

In [7]:
# Download the full records for the search hits as FASTA text. Using the
# handle as a context manager closes the connection after reading, which the
# original cell never did.
with Entrez.efetch(
    db="nucleotide",
    id=",".join(ids["IdList"]),
    rettype="fasta",
    retmode="text",
) as handle:
    print(handle.read())

>KU641798.1 Pinus pinaster cytosolic glutamine synthetase GS1b (GS1b) mRNA, complete cds
CTTCCTCAGGTCGGGCTTGCCCTTTGCATCAATTGCTATAAATTCTTATTTCAGTGGCCTTTATTTCGAA
ATAGCAGATCAAAGGCCTTCACTGCTTGCAGAATTATACTTGTGCGGGAGTCTGTGATTTTGTTGTACAT
CCAAGATGTCTCTACTGACGGATTTGATCAACTTGGACCTCTCTGATGTCACTGAGAAGATCATCGCTGA
GTACATATGGATCGGAGGCTCTGGCATGGATATCCGCAGCAAGGCCAGGACCTTATCTCACCCAGTTACG
GACCCCAAAGATCTACCCAAGTGGAATTATGATGGATCCAGTACTGGACAGGCTCCTGGAAAGGATAGTG
AAGTCATCCTTTACCCTCAGGCTATCTTCAGGGATCCATTCCGCAGGGGGAACAACATCTTGGTGATTTG
TGATACATATACCCCAGCTGGAGAACCTATTCCTACTAACAAGAGAGCAAATGCTGCAAAAATATTTAGC
CATCCCGATGTTGTTGTCGAGGAACCATGGTACGGGATTGAACAAGAATACACTCTTCTGCAAAAGGATG
TGAATTGGCCTCTTGGATGGCCCGTAGGTGGTTACCCTGGTCCTCAGGGTCCTTATTATTGTGGAACTGG
AGCAGACAAAGCCTACGGCCGTGATATCGTCGATGCCCACTATAAGGCTTGCCTGTATGCAGGAATCAAC
ATTAGTGGCATCAATGGAGAAGTCATGCCCGGTCAATGGGAATTTCAAGTTGGCCCGACGGTTGGTATTT
CATCTGGTGATCAAGTCTGGGCTGCACGTTACCTTCTTGAGAGAATCACAGAAGTGGCTGGTGTTGTCCT
CTCATTTGACCCCAAACCCATTCAGGGTGATTGGAATGGTGCTGGTGCTCACACTAACT

###  ELink: Searching for related items in NCBI Entrez

In [8]:
# PMID 19304878 is the Biopython article in PubMed.
# Parse the ELink response inside a with-block so the connection is closed
# afterwards (the nested-call form left it open).
with Entrez.elink(dbfrom="pubmed", id="19304878") as handle:
    record = Entrez.read(handle)

# One line per link set: target database, link name and number of links.
for linksetdb in record[0]["LinkSetDb"]:
    print(linksetdb["DbTo"], linksetdb["LinkName"], len(linksetdb["Link"]))

pubmed pubmed_pubmed 193
pubmed pubmed_pubmed_alsoviewed 5
pubmed pubmed_pubmed_citedin 547
pubmed pubmed_pubmed_combined 6
pubmed pubmed_pubmed_five 6
pubmed pubmed_pubmed_refs 17
pubmed pubmed_pubmed_reviews 8
pubmed pubmed_pubmed_reviews_five 6


In [9]:
# Links of the first link set returned by ELink (listed first in the output
# above).
first_linkset_links = record[0]["LinkSetDb"][0]["Link"]
# Take the second entry — presumably the first one is the query article
# itself; TODO confirm.
related = first_linkset_links[1]["Id"]
related

'14630660'

In [10]:
# Fetch the PubMed summary of the related article and show its title.
# The with-block closes the connection after parsing (the nested-call form
# never closed it).
with Entrez.esummary(db="pubmed", id=related) as handle:
    result = Entrez.read(handle)
result[0]["Title"]

'PDB file parser and structure class implemented in Python.'

### EGQuery: Global Query - counts for search terms

In [11]:
# Global query: hit counts for the term across the Entrez databases.
# NOTE(review): NCBI may have retired the EGQuery endpoint — confirm it is
# still available before relying on this cell.
# The with-block closes the connection once the response has been parsed.
with Entrez.egquery(term="rosetta software") as handle:
    record = Entrez.read(handle)

# One row (a dict with DbName, MenuName, Count and Status) per database.
for row in record["eGQueryResult"]:
    print(row)

{'DbName': 'pubmed', 'MenuName': 'PubMed', 'Count': '267', 'Status': 'Ok'}
{'DbName': 'pmc', 'MenuName': 'PubMed Central', 'Count': '9349', 'Status': 'Ok'}
{'DbName': 'mesh', 'MenuName': 'MeSH', 'Count': '5', 'Status': 'Ok'}
{'DbName': 'books', 'MenuName': 'Books', 'Count': '13', 'Status': 'Ok'}
{'DbName': 'pubmedhealth', 'MenuName': 'PubMed Health', 'Count': '7', 'Status': 'Ok'}
{'DbName': 'omim', 'MenuName': 'OMIM', 'Count': '0', 'Status': 'Term or Database is not found'}
{'DbName': 'ncbisearch', 'MenuName': 'Site Search', 'Count': '0', 'Status': 'Term or Database is not found'}
{'DbName': 'nuccore', 'MenuName': 'Nucleotide', 'Count': '0', 'Status': 'Term or Database is not found'}
{'DbName': 'nucgss', 'MenuName': 'GSS', 'Count': '438363', 'Status': 'Ok'}
{'DbName': 'nucest', 'MenuName': 'EST', 'Count': '0', 'Status': 'Term or Database is not found'}
{'DbName': 'protein', 'MenuName': 'Protein', 'Count': '20', 'Status': 'Ok'}
{'DbName': 'genome', 'MenuName': 'Genome', 'Count': '0', 'S

###  ESpell: Obtaining spelling suggestions

In [12]:
# Ask ESpell for a spelling suggestion for a deliberately misspelled term.
# The with-block closes the connection once the response has been parsed.
with Entrez.espell(term="rosseta software") as handle:
    record = Entrez.read(handle)
# The query exactly as it was submitted.
record["Query"]

'rosseta software'

In [13]:
# Spelling suggestion returned by ESpell for the query submitted in the
# previous cell.
record["CorrectedQuery"]

'rosetta software'

### EPost: UID uploads

In [15]:
# Upload the UIDs found by the earlier search to the Entrez history server.
ids_post = ",".join(ids["IdList"])
with Entrez.epost(db="nucleotide", id=ids_post) as handle:
    session = Entrez.read(handle)

# WebEnv + QueryKey together identify the uploaded UID set on the server.
webenv = session["WebEnv"]
query_key = session["QueryKey"]

# Fetch the records through the history server. The UID list is deliberately
# NOT sent again: the posted set is referenced via webenv/query_key, which is
# the whole point of EPost. Both handles are closed by their with-blocks.
with Entrez.efetch(
    db="nucleotide",
    retmode="text",
    rettype="fasta",
    webenv=webenv,
    query_key=query_key,
) as handle:
    record = handle.read()
print(record)

>KU641798.1 Pinus pinaster cytosolic glutamine synthetase GS1b (GS1b) mRNA, complete cds
CTTCCTCAGGTCGGGCTTGCCCTTTGCATCAATTGCTATAAATTCTTATTTCAGTGGCCTTTATTTCGAA
ATAGCAGATCAAAGGCCTTCACTGCTTGCAGAATTATACTTGTGCGGGAGTCTGTGATTTTGTTGTACAT
CCAAGATGTCTCTACTGACGGATTTGATCAACTTGGACCTCTCTGATGTCACTGAGAAGATCATCGCTGA
GTACATATGGATCGGAGGCTCTGGCATGGATATCCGCAGCAAGGCCAGGACCTTATCTCACCCAGTTACG
GACCCCAAAGATCTACCCAAGTGGAATTATGATGGATCCAGTACTGGACAGGCTCCTGGAAAGGATAGTG
AAGTCATCCTTTACCCTCAGGCTATCTTCAGGGATCCATTCCGCAGGGGGAACAACATCTTGGTGATTTG
TGATACATATACCCCAGCTGGAGAACCTATTCCTACTAACAAGAGAGCAAATGCTGCAAAAATATTTAGC
CATCCCGATGTTGTTGTCGAGGAACCATGGTACGGGATTGAACAAGAATACACTCTTCTGCAAAAGGATG
TGAATTGGCCTCTTGGATGGCCCGTAGGTGGTTACCCTGGTCCTCAGGGTCCTTATTATTGTGGAACTGG
AGCAGACAAAGCCTACGGCCGTGATATCGTCGATGCCCACTATAAGGCTTGCCTGTATGCAGGAATCAAC
ATTAGTGGCATCAATGGAGAAGTCATGCCCGGTCAATGGGAATTTCAAGTTGGCCCGACGGTTGGTATTT
CATCTGGTGATCAAGTCTGGGCTGCACGTTACCTTCTTGAGAGAATCACAGAAGTGGCTGGTGTTGTCCT
CTCATTTGACCCCAAACCCATTCAGGGTGATTGGAATGGTGCTGGTGCTCACACTAACT