In [2]:
# install biopython on Jupyter server.
import sys
!pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/3c/5a/05c29e3a75269e00e50779b28b60041d9783ee79054ce53a1d193e05deab/biopython-1.73-cp35-cp35m-manylinux1_x86_64.whl (2.2MB)
[K    100% |████████████████████████████████| 2.2MB 709kB/s eta 0:00:01
Installing collected packages: biopython
Successfully installed biopython-1.73
[33mYou are using pip version 18.1, however version 19.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [37]:
import time
from urllib.error import HTTPError

from Bio import Entrez

In [38]:
# User email.
Entrez.email = "lonphan@nih.gov"
# API key from NCBI (https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/).
# With a valid API key 10 queries per second are allowed; with None the limit is 3 queries per second.
Entrez.api_key = None

# The Entrez query (term) can be built and tested online with the web query builder
# (https://www.ncbi.nlm.nih.gov/snp/advanced).
# This one searches for gene LPL.
snp_query = 'LPL[All Fields] AND (pathogenic[Clinical_Significance] AND missense[Function_Class])'

esearch_options = dict(
    db="snp",          # search dbSNP
    term=snp_query,
    usehistory="y",    # cache the result on the server for download in batches
    retmax=20,         # return at most 20 RSIDs
)

# esearch handle
eShandle = Entrez.esearch(**esearch_options)


In [39]:

# Parse the esearch response into eSresult, then release the handle:
# once it has been read there is nothing more to get from it.
try:
    eSresult = Entrez.read(eShandle)
finally:
    eShandle.close()

In [40]:
# Review the results: print every field of the parsed esearch response.
for field, value in eSresult.items():
    print (field, ":", value)

# Output: the Web environment (&WebEnv) and query key (&query_key) parameters specify the
# location on the Entrez history server of the list of UIDs matching the Entrez query.
# https://www.ncbi.nlm.nih.gov/books/NBK25500/#chapter1.Storing_Search_Results
    

RetMax : 20
RetStart : 0
Count : 30
WebEnv : NCID_1_241490309_130.14.18.125_9001_1548190373_483640861_0MetA0_S_MegaStore
TranslationStack : [DictElement({'Term': 'LPL[All Fields]', 'Field': 'All Fields', 'Count': '21205', 'Explode': 'N'}, attributes={}), DictElement({'Term': 'pathogenic[Clinical_Significance]', 'Field': 'Clinical_Significance', 'Count': '50439', 'Explode': 'N'}, attributes={}), DictElement({'Term': 'missense[Function_Class]', 'Field': 'Function_Class', 'Count': '7578995', 'Explode': 'N'}, attributes={}), 'AND', 'GROUP', 'AND']
QueryTranslation : LPL[All Fields] AND (pathogenic[Clinical_Significance] AND missense[Function_Class])
QueryKey : 1
IdList : ['386571803', '386481815', '118204082', '118204080', '118204079', '118204078', '118204077', '118204076', '118204075', '118204073', '118204072', '118204071', '118204069', '118204068', '118204067', '118204064', '118204063', '118204062', '118204061', '118204060']
TranslationSet : [DictElement({'From': 'LPL[All Fields]', 'To':

In [41]:
# Take the list of RSIDs ('IdList') out of the esearch result.
# This holds only the IDs returned by the search, not every match.
rslist = eSresult['IdList']

In [42]:
# The search used retmax = 20, so only 20 RSIDs are printed here.
# Additional results can be retrieved in batches; for an example see
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc139 or the cells below.
for rsid in rslist:
    print(rsid)

386571803
386481815
118204082
118204080
118204079
118204078
118204077
118204076
118204075
118204073
118204072
118204071
118204069
118204068
118204067
118204064
118204063
118204062
118204061
118204060


In [43]:
# Get the WebEnv session cookie and the QueryKey from the esearch result,
# plus the total number of matches and the batch size for the download.
webenv = eSresult["WebEnv"]
query_key = eSresult["QueryKey"]
total_count = int(eSresult["Count"])  # "Count" is converted to int for use in range()
retmax = 5  # return 5 UIDs per batch

In [44]:
# Download the search results from the Entrez history server in batches of `retmax`.
# Sample code adapted from http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc139.
#
# Each batch is requested up to `max_attempts` times. Only HTTP 5xx (server-side)
# errors are retried; any other HTTP error is re-raised immediately. A batch that
# still fails after the last attempt is reported and skipped.
max_attempts = 3
fetch_count = 0  # not used by this cell; kept so the name stays defined
for start in range(0, total_count, retmax):
    end = min(total_count, start + retmax)
    print("Going to download record %i to %i" % (start+1, end))
    # Reset per batch so a failed batch can never reuse the previous batch's handle.
    fetch_handle = None
    for attempt in range(1, max_attempts + 1):
        try:
            fetch_handle = Entrez.efetch(db="snp",
                                         rettype="uilist",  # available types: uilist | docsum (use retmode="xml")
                                         retstart=start,
                                         retmax=retmax,
                                         webenv=webenv,
                                         query_key=query_key)
            break  # success: do not request the same batch again
        except HTTPError as err:
            if not 500 <= err.code <= 599:
                raise  # not a server-side error, so retrying is pointless
            print("Received error from server %s" % err)
            print("Attempt %i of %i" % (attempt, max_attempts))
            if attempt < max_attempts:
                time.sleep(15)  # wait before retrying; no point sleeping after the last try
    if fetch_handle is None:
        print("Giving up on record %i to %i after %i attempts" % (start+1, end, max_attempts))
        continue
    try:
        data = fetch_handle.read()
    finally:
        # Close the handle even if reading fails.
        fetch_handle.close()
    print(data)



Going to download record 1 to 5
386571803
386481815
118204082
118204080
118204079

Going to download record 6 to 10
118204078
118204077
118204076
118204075
118204073

Going to download record 11 to 15
118204072
118204071
118204069
118204068
118204067

Going to download record 16 to 20
118204064
118204063
118204062
118204061
118204060

Going to download record 21 to 25
118204059
118204058
118204057
118204056
52818902

Going to download record 26 to 30
52806281
28934893
17850737
1801177
268

