In [1]:
# install biopython on Jupyter server.
import sys
!pip install biopython

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/d0/5d/8aedf28541f4936707d78997ebe6d9e25935ae6df6b8f7a045ce294df664/biopython-1.73-cp37-cp37m-manylinux1_x86_64.whl (2.2MB)
[K    100% |████████████████████████████████| 2.2MB 911kB/s eta 0:00:01
[?25hCollecting numpy (from biopython)
[?25l  Downloading https://files.pythonhosted.org/packages/91/e7/6c780e612d245cca62bc3ba8e263038f7c144a96a54f877f3714a0e8427e/numpy-1.16.2-cp37-cp37m-manylinux1_x86_64.whl (17.3MB)
[K    100% |████████████████████████████████| 17.3MB 135kB/s eta 0:00:01
[?25hInstalling collected packages: numpy, biopython
Successfully installed biopython-1.73 numpy-1.16.2


In [2]:
import time
from urllib.error import HTTPError

from Bio import Entrez

In [20]:
# Identify yourself to NCBI: replace this with your own e-mail address.
Entrez.email = "dbsnp-user@nih.gov"

# RECOMMENDED: apply for an API key from NCBI
# (https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/).
# A valid API key allows 10 queries per second; with None only 3 queries
# per second are allowed.
Entrez.api_key = None

# The Entrez query (term) can be built and tested online with the web query
# builder (https://www.ncbi.nlm.nih.gov/snp/advanced).
# Open the esearch handle.
eShandle = Entrez.esearch(
    db="snp",  # search dbSNP
    # search for gene LPL
    term='LPL[All Fields] AND (pathogenic[Clinical_Significance] AND "missense variant"[Function_Class])',
    usehistory="y",  # cache the result on the server for download in batches
    retmax=20,  # return at most 20 RSIDs
)


In [21]:

# Parse the esearch response into a dictionary-like result, then close the
# handle: the parse consumes it, and nothing else in the notebook closes it.
try:
    eSresult = Entrez.read(eShandle)
finally:
    eShandle.close()

In [22]:
# Review the esearch result: print every field, one per line.
#
# Among the fields are the web environment (&WebEnv) and query key (&query_key)
# parameters, which specify the location on the Entrez history server of the
# list of UIDs matching the Entrez query. See
# https://www.ncbi.nlm.nih.gov/books/NBK25500/#chapter1.Storing_Search_Results
for field, value in eSresult.items():
    print(field, ":", value)
    

Count : 31
RetMax : 20
RetStart : 0
QueryKey : 1
WebEnv : NCID_1_110799343_130.14.22.76_9001_1554912257_1392080876_0MetA0_S_MegaStore
IdList : ['386571803', '386481815', '372668179', '118204082', '118204080', '118204079', '118204078', '118204077', '118204076', '118204075', '118204073', '118204072', '118204071', '118204069', '118204068', '118204067', '118204064', '118204063', '118204062', '118204061']
TranslationSet : [DictElement({'From': 'LPL[All Fields]', 'To': 'LPL[All Fields]'}, attributes={})]
TranslationStack : [DictElement({'Term': 'LPL[All Fields]', 'Field': 'All Fields', 'Count': '21252', 'Explode': 'N'}, attributes={}), DictElement({'Term': 'pathogenic[Clinical_Significance]', 'Field': 'Clinical_Significance', 'Count': '63619', 'Explode': 'N'}, attributes={}), DictElement({'Term': '"missense variant"[Function_Class]', 'Field': 'Function_Class', 'Count': '7678169', 'Explode': 'N'}, attributes={}), 'AND', 'GROUP', 'AND']
QueryTranslation : LPL[All Fields] AND (pathogenic[Clinic

In [23]:
# Pull the list of matching RSIDs out of the search result ('IdList' field).
rslist = eSresult["IdList"]

In [28]:
# The search was issued with retmax = 20, so only 20 RSIDs are printed here.
# Additional results can be retrieved in batches; for a batch download example
# see http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc139 or see below.
for rsid in rslist:
    print(rsid)

386571803
386481815
372668179
118204082
118204080
118204079
118204078
118204077
118204076
118204075
118204073
118204072
118204071
118204069
118204068
118204067
118204064
118204063
118204062
118204061


In [31]:
# Collect what the batched download needs from the esearch result:
# the WebEnv session cookie, the QueryKey, and the total number of hits.
webenv = eSresult["WebEnv"]
query_key = eSresult["QueryKey"]
total_count = int(eSresult["Count"])
retmax = 20  # number of UIDs to request per batch

In [32]:
# Download every UID matching the search in batches of `retmax`, reading from
# the result set referenced by webenv / query_key.
# Sample code adapted with modifications from
# http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc139.
fetch_count = 0  # not updated by this cell
max_attempts = 3
for start in range(0, total_count, retmax):
    end = min(total_count, start + retmax)
    print("Going to download record %i to %i" % (start+1, end))
    # Reset for each batch so that a batch whose attempts all fail can never
    # pick up the (already closed) handle left over from the previous batch.
    fetch_handle = None
    attempt = 0
    # Stop as soon as one request succeeds; without this condition every
    # batch would be requested max_attempts times and the extra handles leaked.
    while fetch_handle is None and attempt < max_attempts:
        attempt += 1
        try:
            fetch_handle = Entrez.efetch(db="snp",
                                         rettype="uilist",  # available types: uilist | docsum (use retmode="xml")
                                         retstart=start,
                                         retmax=retmax,
                                         webenv=webenv,
                                         query_key=query_key)
        except HTTPError as err:
            if 500 <= err.code <= 599:
                # Server-side error: report it, wait, and try again.
                print("Received error from server %s" % err)
                print("Attempt %i of %i" % (attempt, max_attempts))
                time.sleep(15)
            else:
                # Any other HTTP error is not retried.
                raise
    if fetch_handle is None:
        # Every attempt hit a server error; report it and move to the next batch.
        print("Giving up on record %i to %i after %i attempts" % (start+1, end, max_attempts))
        continue
    try:
        data = fetch_handle.read()
    finally:
        # Close the handle even if reading fails.
        fetch_handle.close()
    print(data)



Going to download record 1 to 20
386571803
386481815
372668179
118204082
118204080
118204079
118204078
118204077
118204076
118204075
118204073
118204072
118204071
118204069
118204068
118204067
118204064
118204063
118204062
118204061

Going to download record 21 to 31
118204060
118204059
118204058
118204057
118204056
52818902
52806281
28934893
17850737
1801177
268

