
Merge pull request #7 from nextstrain/genbank

Genbank API for reference lookup
barneypotter24 committed Nov 3, 2017
2 parents 57cddfe + 0d32a75 commit d01da681eb378ae249c165ec2d806ef02d45b30b
Showing 6 changed files with 146 additions and 14 deletions.
  1. +4 −2 src/cfg.py
  2. +18 −7 src/cleaning_functions.py
  3. +6 −4 src/dataset.py
  4. +62 −0 src/genbank_API.py
  5. +32 −0 src/genbank_parsers.py
  6. +24 −1 src/run.py
src/cfg.py
@@ -2,7 +2,7 @@
from cleaning_functions import *
### Acceptable parameters ###
-viruses = [ 'seasonal_flu' ]
+viruses = [ 'seasonal_flu', 'piglets' ]
subtypes = { 'seasonal_flu': [ 'h3n2', 'h1n1pdm', 'vic', 'yam' ] }
datatypes = [ 'titer', 'sequence', 'virus' ]
filetypes = [ 'fasta' ]
@@ -14,13 +14,15 @@
### Mappings used by sacra ###
# Lists sources from which different datatypes come from
-sources = { 'sequence' : [ 'gisaid', 'fauna', 'vipr' ],
+sources = { 'sequence' : [ 'gisaid', 'fauna', 'fauna_mumps', 'vipr' ], ## duplication of keys in fasta_headers
            'titer' : [ 'crick', 'cdc' ] }
# For each sequence source, the default order of fields in the fasta header
fasta_headers = { 'gisaid' : [ 'accession', 'strain', 'isolate_id', 'locus', 'passage', 'submitting_lab' ],
                  'fauna' : [ 'strain', 'virus', 'accession', 'collection_date', 'region', 'country', 'division', 'location', 'passage', 'source', 'age' ],
+                 'fauna_mumps' : [ 'strain', 'virus', 'accession', 'collection_date', 'country', 'division', 'muv_genotype', 'host', 'authors', 'publication_name', 'journal', 'attribution_url', 'accession_url' ],
                  'vipr': [ 'accession', 'strain', 'locus', 'date', 'host', 'country', 'subtype', 'virus' ] }
metadata_fields = set( [ 'isolate_id', 'subtype', 'submitting_lab', 'passage_history', 'location', 'collection_date' ] )
required_fields = { 'sequence' : { 'strain', 'date', 'accession', 'source', 'locus', 'sequence', 'isolate_id' } }
optional_fields = { 'sequence': { 'strain', 'date', 'passage_category', 'source', 'submitting_lab',
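
A note on how these mappings are consumed (a sketch, not part of the diff): each fasta_headers entry lists fields in header order, so a pipe-delimited header from a given source can be zipped into a document dict. Field list truncated and all values below hypothetical, assuming the usual pipe-delimited fauna download format.

    # Minimal sketch; values are placeholders, not real data.
    fields = ['strain', 'virus', 'accession', 'collection_date', 'country']
    header = "strainA|mumps|ACC001|2017-01-01|country_x"
    doc = dict(zip(fields, header.split('|')))
    # doc == {'strain': 'strainA', 'virus': 'mumps', 'accession': 'ACC001',
    #         'collection_date': '2017-01-01', 'country': 'country_x'}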
src/cleaning_functions.py
@@ -1,6 +1,8 @@
import re, sys
import cfg
import csv
+import logging
+sys.path.append('')
# Cleaning functions that will clean the data in a dataset object.
# These are kept separate from class functions to make it easier for the user to
@@ -22,7 +24,7 @@ def fix_accession(doc, key, remove, *args):
    '''
    if 'accession' in doc and doc['accession'] is not None:
        doc['accession'] = doc['accession'].encode('ascii', 'replace')
-        doc['accession'] = doc['accession'].lower()
+        # doc['accession'] = doc['accession'].lower() # revisit this!
        if doc['accession'].startswith('epi'):
            doc['accession'] = doc['accession'][2:]
@@ -41,8 +43,9 @@ def fix_locus(doc, key, remove, *args):
    '''
    if 'locus' in doc and doc['locus'] is not None:
        doc['locus'] = doc['locus'].lower()
-    else:
-        remove.append(key)
+    # commented out as if the header didn't have this it would remove all documents!
+    # else:
+    #     remove.append(key)

def fix_strain(doc, key, remove, *args):
    '''
@@ -69,12 +72,15 @@ def fix_submitting_lab(doc, key, remove, *args):
    '''
    Moved to cleaning_functions/fix/submitting_lab.py
    '''
+    logger = logging.getLogger(__name__)
    if 'submitting_lab' in doc and doc['submitting_lab'] is not None:
        if doc['submitting_lab'] == 'CentersforDiseaseControlandPrevention':
            doc['submitting_lab'] = 'CentersForDiseaseControlAndPrevention'
        doc['submitting_lab'] = camelcase_to_snakecase(doc['submitting_lab'])
-    else:
-        remove.append(key)
+    # commented out as if the header didn't have this it would remove all documents!
+    # else:
+    #     logger.warn("Dropping {} - bad submitting lab".format(doc['strain']))
+    #     remove.append(key)

def fix_age(doc, *args):
    '''
@@ -169,8 +175,13 @@ def camelcase_to_snakecase(name):
## Moved to cleaning_functions/fix/strain.py
def format_names(docs, virus):
-    fix_whole_name = define_strain_fixes(cfg.strain_fix_fname[virus])
-    label_to_fix = define_location_fixes(cfg.label_fix_fname[virus])
+    logger = logging.getLogger(__name__)
+    try:
+        fix_whole_name = define_strain_fixes(cfg.strain_fix_fname[virus])
+        label_to_fix = define_location_fixes(cfg.label_fix_fname[virus])
+    except KeyError:
+        logger.info("Skipping format_names as files not found")
+        return
    for doc in docs:
        # Fix this when switching docs to dict
        for key in doc:
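
For context, a hedged sketch of the calling convention these fix_* functions follow: each mutates a document dict in place, and key identifies the document in its parent collection so it can be flagged in remove. Document contents below are hypothetical.

    # Minimal sketch of the fix_* convention; doc values are placeholders.
    doc, remove = {'accession': 'epi12345', 'locus': 'HA'}, []
    fix_accession(doc, 0, remove)   # strips the leading 'epi' -> '12345'
    fix_locus(doc, 0, remove)       # lowercases -> 'ha'
    # with the else branches commented out above, remove stays empty
    # even when a field is missing from the header
    assert remove == []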
src/dataset.py
@@ -1,6 +1,7 @@
import os, time, datetime, csv, sys, json
import cfg
from Bio import SeqIO
from pdb import set_trace
+sys.path.append('')
class Dataset:
@@ -227,7 +228,7 @@ def compile_virus_table(self, subtype, **kwargs):
            vs[name]['host'] = 'human'
            self.dataset[virus].pop('host',None)
        else:
-            vs[name]['host'] = name['host']
+            vs[name]['host'] = self.dataset[virus]['host']
            self.dataset[virus].pop('host',None)

        # Scrape host age
@@ -253,9 +254,10 @@ def compile_virus_table(self, subtype, **kwargs):
        for name in vs.keys():
            # Scrape number of segments
            segments = set()
-            for a in vs[name]['accessions']:
-                segments.add(self.dataset[a]['locus'])
-            vs[name]['number_of_segments'] = len(segments)
+            for acc in vs[name]['accessions']:
+                if 'locus' in self.dataset[acc]:
+                    segments.add(self.dataset[acc]['locus'])
+            vs[name]['number_of_segments'] = len(segments) if len(segments) else 1

            # # Scrape isolate ids
            # ids = set()
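
The new segment count tolerates records that carry no locus and falls back to one segment. A small illustration with hypothetical records:

    # Hypothetical records: A3 lacks a 'locus', so only two loci are counted.
    dataset = {'A1': {'locus': 'ha'}, 'A2': {'locus': 'na'}, 'A3': {}}
    segments = set()
    for acc in ['A1', 'A2', 'A3']:
        if 'locus' in dataset[acc]:
            segments.add(dataset[acc]['locus'])
    number_of_segments = len(segments) if len(segments) else 1   # == 2
    # with no loci at all, segments would be empty and the count falls back to 1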
src/genbank_API.py
@@ -0,0 +1,62 @@
from __future__ import print_function
import os, csv, sys, json, re
import logging
from pdb import set_trace
from Bio import Entrez
from Bio import SeqIO

def query_genbank(accessions, parsers, email=None, retmax=10, n_entrez=10, gbdb="nuccore", **kwargs):
    # https://www.biostars.org/p/66921/
    logger = logging.getLogger(__name__)
    if not email:
        email = os.environ['NCBI_EMAIL']
    Entrez.email = email
    logger.debug("Genbank email: {}".format(email))

    # prepare queries for download in chunks no greater than n_entrez
    queries = []
    for i in sorted(xrange(0, len(accessions), n_entrez)):
        queries.append(set(accessions[i:i+n_entrez]))

    def esearch(accs):
        if len(accs) == 0:
            logger.debug("No accessions left to search")
            return
        logger.debug("esearch with {}".format(accs))
        list_accs = list(accs)
        res = Entrez.read(Entrez.esearch(db=gbdb, term=" ".join(list_accs), retmax=retmax))
        if "ErrorList" in res:
            not_found = res["ErrorList"]["PhraseNotFound"][0]
            logger.debug("esearch failed - accession {} doesn't exist. Retrying".format(not_found))
            accs.remove(not_found)
            esearch(accs)
        else: # success :)
            for i, x in enumerate(list_accs):
                acc_gi_map[x] = res["IdList"][i]

    # populate Accession -> GI number via entrez esearch
    acc_gi_map = {x:None for x in accessions}
    for qq in queries:
        esearch(qq)
    gi_numbers = [x for x in acc_gi_map.values() if x != None]

    # get entrez data via efetch
    logger.debug("Getting epost results for {}".format(gi_numbers))
    try:
        search_handle = Entrez.epost(db=gbdb, id=",".join(gi_numbers))
        search_results = Entrez.read(search_handle)
        webenv, query_key = search_results["WebEnv"], search_results["QueryKey"]
    except:
        logger.critical("ERROR: Couldn't connect with entrez, please run again")
        sys.exit(2)

    for start in range(0, len(gi_numbers), retmax):
        # fetch entries in batch
        try:
            handle = Entrez.efetch(db=gbdb, rettype="gb", retstart=start, retmax=retmax, webenv=webenv, query_key=query_key)
        except IOError:
            logger.critical("ERROR: Couldn't connect with entrez, please run again")
        else:
            SeqIO_records = SeqIO.parse(handle, "genbank")
            for record in SeqIO_records:
                for parser in parsers:
                    parser(record, **kwargs)
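
Roughly how query_genbank is meant to be driven (a sketch under assumptions: the accession values are placeholders, and NCBI_EMAIL must be set in the environment unless email is passed explicitly). Each parser callback receives every fetched Biopython SeqRecord:

    # Hedged usage sketch; accessions are placeholders, not real entries.
    def print_organism(record, **kwargs):
        print("{}: {}".format(record.id, record.annotations.get("organism")))

    query_genbank(accessions=["AB000001", "AB000002"],
                  parsers=[print_organism],
                  email="you@example.org")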
src/genbank_parsers.py
@@ -0,0 +1,32 @@
import logging
import re

def extract_attributions(data, record, **kwargs):
    logger = logging.getLogger(__name__)
    accession = re.match(r'^([^.]*)', record.id).group(0).upper() # get everything before the '.'?
    if accession not in data:
        logger.warn("Error with accession {}".format(accession))
        return
    references = record.annotations["references"]
    if len(references):
        # is there a reference which is not a "Direct Submission"?
        data[accession] = {}
        titles = [reference.title for reference in references]
        try:
            idx = [i for i, j in enumerate(titles) if j is not None and j != "Direct Submission"][0]
        except IndexError: # fall back to direct submission
            idx = [i for i, j in enumerate(titles) if j is not None][0]
        reference = references[idx] # <class 'Bio.SeqFeature.Reference'>
        keys = reference.__dict__.keys()
        data[accession]['title'] = reference.title
        if "authors" in keys and reference.authors is not None:
            first_author = re.match(r'^([^,]*)', reference.authors).group(0)
            data[accession]['authors'] = first_author + " et al"
        if "journal" in keys and reference.journal is not None:
            data[accession]['journal'] = reference.journal
        if "pubmed_id" in keys and reference.pubmed_id is not None:
            data[accession]["puburl"] = "https://www.ncbi.nlm.nih.gov/pubmed/" + reference.pubmed_id
        logger.debug("{} -> {}".format(accession, data[accession]))
    else:
        logger.debug("{} had no references".format(accession))
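
The shape this parser produces, illustratively: data is keyed by versionless accession, and an entry with a usable reference gains title/authors/journal/puburl. All values below are hypothetical.

    # Hypothetical before/after for a single accession.
    data = {'AB000001': None}
    # extract_attributions(data, record) on a record whose best reference is
    # not a "Direct Submission" would leave something like:
    # data['AB000001'] == {'title': 'Some paper title',
    #                      'authors': 'Smith et al',
    #                      'journal': 'J. Virol. 91 (2017)',
    #                      'puburl': 'https://www.ncbi.nlm.nih.gov/pubmed/12345678'}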
src/run.py
@@ -1,9 +1,15 @@
from dataset import Dataset
+from genbank_API import query_genbank
+from genbank_parsers import extract_attributions
import cfg as cfg
import argparse
import os, sys
+from pdb import set_trace
+import logging
+from functools import partial
+sys.path.append('')

def assert_valid_input(virus, datatype, path, outpath, infiles, source, subtype, ftype, **kwargs):
    '''
    Make sure that all the given arguments are valid.
@@ -36,7 +42,7 @@ def list_options(list_viruses, list_datatypes):
if __name__=="__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('-v', '--virus', default='seasonal_flu', type=str, help='virus type to be processed; default is seasonal_flu')
+    parser.add_argument('-v', '--virus', required=True, type=str, help='virus type to be processed; default is seasonal_flu')
    parser.add_argument('-d', '--datatype', default='sequence', type=str, help='type of data being input; default is \"sequence\", other options are \"virus\" or \"titer\"')
    parser.add_argument('-p', '--path', default='data/', type=str, help='path to input file(s), default is \"data/\"')
    parser.add_argument('-m', '--metafile', default=None, type=str, help='name of file containing virus metadata')
@@ -49,12 +55,29 @@ def list_options(list_viruses, list_datatypes):
    parser.add_argument('--list_datatypes', default=False, action='store_true', help='list all supported datatypes and exit')
    parser.add_argument('--permissions', default='public', help='permissions level for documents in JSON')
    parser.add_argument('--test', default=False, action='store_true', help='test run for debugging') # Remove this at some point.
+    parser.add_argument("--debug", action="store_const", dest="loglevel", const=logging.DEBUG, help="Enable debugging logging")
+    parser.add_argument("--update_attributions_via_genbank", action="store_true", default=False, help="Use the genbank API to fill in attributions for accession numbers")
+    ## there will be heaps of arguments here (about 15 just for genbank API) - we should look into argument grouping
    args = parser.parse_args()

    list_options(args.list_viruses, args.list_datatypes)
    assert_valid_input(**args.__dict__)

+    ## set up logger - it can now be used anywhere simply via
+    ## logger = logging.getLogger(__name__)
+    if not args.loglevel: args.loglevel = logging.INFO
+    logging.basicConfig(level=args.loglevel, format='%(asctime)-15s %(message)s')
+    logger = logging.getLogger(__name__)

    if args.test:
        D = Dataset(**args.__dict__)
        D.set_sequence_permissions(args.permissions)

+        ## not sure of the best interface here...
+        if (args.update_attributions_via_genbank):
+            accessions = D.dataset.keys() # this will change
+            attributions = {x: None for x in accessions}
+            extract_attributions_bound = partial(extract_attributions, attributions)
+            query_genbank(accessions=accessions, parsers=[extract_attributions_bound], **vars(args))
+            ## now merge attributions into D
+            ## i'm leaving this for now as the schema is changing

        D.write('%s%s_%s.json' % (args.outpath, args.virus, args.datatype))
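
Putting it together, an illustrative invocation using only the flags visible in this diff (other options such as source or subtype may also be required by assert_valid_input):

    python src/run.py -v seasonal_flu -d sequence --test --debug \
        --update_attributions_via_genbank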
