Permalink
Browse files

genbank API interface to run

  • Loading branch information...
jameshadfield committed Nov 3, 2017
1 parent da48efa commit 0d32a75bb285bf10635398efacdcf688732d43f4
Showing with 107 additions and 130 deletions.
  1. +0 −127 src/add_attributions_via_API.py
  2. +62 −0 src/genbank_API.py
  3. +32 −0 src/genbank_parsers.py
  4. +13 −3 src/run.py

This file was deleted.

Oops, something went wrong.
View
@@ -0,0 +1,62 @@
from __future__ import print_function
import os, csv, sys, json, re
import logging
from pdb import set_trace
from Bio import Entrez
from Bio import SeqIO
def query_genbank(accessions, parsers, email=None, retmax=10, n_entrez=10, gbdb="nuccore", **kwargs):
# https://www.biostars.org/p/66921/
logger = logging.getLogger(__name__)
if not email:
email = os.environ['NCBI_EMAIL']
Entrez.email = email
logger.debug("Genbank email: {}".format(email))
# prepare queries for download in chunks no greater than n_entrez
queries = []
for i in sorted(xrange(0, len(accessions), n_entrez)):
queries.append(set(accessions[i:i+n_entrez]))
def esearch(accs):
if len(accs) == 0:
logger.debug("No accessions left to search")
return
logger.debug("esearch with {}".format(accs))
list_accs = list(accs)
res = Entrez.read(Entrez.esearch(db=gbdb, term=" ".join(list_accs), retmax=retmax))
if "ErrorList" in res:
not_found = res["ErrorList"]["PhraseNotFound"][0]
logger.debug("esearch failed - accession {} doesn't exist. Retrying".format(not_found))
accs.remove(not_found)
esearch(accs)
else: # success :)
for i, x in enumerate(list_accs):
acc_gi_map[x] = res["IdList"][i]
# populate Accession -> GI number via entrez esearch
acc_gi_map = {x:None for x in accessions}
for qq in queries:
esearch(qq)
gi_numbers = [x for x in acc_gi_map.values() if x != None]
# get entrez data vie efetch
logger.debug("Getting epost results for {}".format(gi_numbers))
try:
search_handle = Entrez.epost(db=gbdb, id=",".join(gi_numbers))
search_results = Entrez.read(search_handle)
webenv, query_key = search_results["WebEnv"], search_results["QueryKey"]
except:
logger.critical("ERROR: Couldn't connect with entrez, please run again")
sys.exit(2)
for start in range(0, len(gi_numbers), retmax):
#fetch entries in batch
try:
handle = Entrez.efetch(db=gbdb, rettype="gb", retstart=start, retmax=retmax, webenv=webenv, query_key=query_key)
except IOError:
logger.critical("ERROR: Couldn't connect with entrez, please run again")
else:
SeqIO_records = SeqIO.parse(handle, "genbank")
for record in SeqIO_records:
for parser in parsers:
parser(record, **kwargs)
View
@@ -0,0 +1,32 @@
import logging
import re
def extract_attributions(data, record, **kwargs):
logger = logging.getLogger(__name__)
accession = re.match(r'^([^.]*)', record.id).group(0).upper() # get everything before the '.'?
if accession not in data:
logger.warn("Error with accession {}".format(accession))
return
references = record.annotations["references"]
if len(references):
# is there a reference which is not a "Direct Submission"?
data[accession] = {}
titles = [reference.title for reference in references]
try:
idx = [i for i, j in enumerate(titles) if j is not None and j != "Direct Submission"][0]
except IndexError: # fall back to direct submission
idx = [i for i, j in enumerate(titles) if j is not None][0]
reference = references[idx] # <class 'Bio.SeqFeature.Reference'>
keys = reference.__dict__.keys()
data[accession]['title'] = reference.title
if "authors" in keys and reference.authors is not None:
first_author = re.match(r'^([^,]*)', reference.authors).group(0)
data[accession]['authors'] = first_author + " et al"
if "journal" in keys and reference.journal is not None:
data[accession]['journal'] = reference.journal
if "pubmed_id" in keys and reference.pubmed_id is not None:
data[accession]["puburl"] = "https://www.ncbi.nlm.nih.gov/pubmed/" + reference.pubmed_id
logger.debug("{} -> {}".format(accession, data[accession]))
else:
logger.debug("{} had no references".format(accession))
View
@@ -1,11 +1,15 @@
from dataset import Dataset
from genbank_API import query_genbank
from genbank_parsers import extract_attributions
import cfg as cfg
import argparse
import os, sys
from pdb import set_trace
import logging
from functools import partial
sys.path.append('')
def assert_valid_input(virus, datatype, path, outpath, infiles, source, subtype, ftype, **kwargs):
'''
Make sure that all the given arguments are valid.
@@ -52,6 +56,8 @@ def list_options(list_viruses, list_datatypes):
parser.add_argument('--permissions', default='public', help='permissions level for documents in JSON')
parser.add_argument('--test', default=False, action='store_true', help='test run for debugging') # Remove this at some point.
parser.add_argument("--debug", action="store_const", dest="loglevel", const=logging.DEBUG, help="Enable debugging logging")
parser.add_argument("--update_attributions_via_genbank", action="store_true", default=False, help="Use the genbank API to fill in attributions for accession numbers")
## there will be heaps of arguments here (about 15 just for genbank API) - we should look into argument grouping
args = parser.parse_args()
list_options(args.list_viruses, args.list_datatypes)
@@ -66,8 +72,12 @@ def list_options(list_viruses, list_datatypes):
if args.test:
D = Dataset(**args.__dict__)
D.set_sequence_permissions(args.permissions)
## not sure of the best interface here...
if (args.update_attributions_via_genbank):
set_trace()
## not sure of the best interface here...
accessions = D.dataset.keys() # this will change
attributions = {x: None for x in accessions}
extract_attributions_bound = partial(extract_attributions, attributions)
query_genbank(accessions=accessions, parsers=[extract_attributions_bound], **vars(args))
## now merge attributions into D
## i'm leaving this for now as the schema is changing
D.write('%s%s_%s.json' % (args.outpath, args.virus, args.datatype))

0 comments on commit 0d32a75

Please sign in to comment.