
Resolve merge conflicts from master into cleaning_functions

barneypotter24 committed Nov 17, 2017
2 parents c7a21e9 + a33965c commit 08d2bb142db4dfcba5ccccf406acab74f5c5f710
This file was deleted.
@@ -2,7 +2,7 @@
from old_cleaning_functions import *
### Acceptable parameters ###
pathogens = [ 'seasonal_flu' ]
viruses = [ 'seasonal_flu', 'piglets' ]
subtypes = { 'seasonal_flu': [ 'h3n2', 'h1n1pdm', 'vic', 'yam' ] }
datatypes = [ 'titer', 'sequence', 'pathogen' ]
filetypes = [ 'fasta' ]
@@ -14,14 +14,15 @@
### Mappings used by sacra ###
# Lists the sources from which different datatypes come
sources = { 'sequence' : [ 'gisaid', 'fauna', 'vipr' ],
sources = { 'sequence' : [ 'gisaid', 'fauna', 'fauna_mumps', 'vipr' ], ## duplication of keys in fasta_headers
'titer' : [ 'crick', 'cdc' ] }
##### strain_sample from https://github.com/nextstrain/sacra/blob/schema/schema/schema_zika.json#L100
# For each sequence source, the default order of fields in the fasta header
fasta_headers = { 'gisaid' : [ 'sequence_name', 'strain_name', 'sample_name', 'locus', 'passage', 'sequencing_lab' ],
'fauna' : [ 'strain', 'pathogen', 'sequence_name', 'collection_date', 'region', 'country', 'division', 'location', 'passage', 'source', 'age' ],
'vipr': [ 'sequence_name', 'strain', 'locus', 'date', 'host', 'country', 'subtype', 'pathogen' ] }
metadata_fields = set( [ 'isolate_id', 'subtype', 'submitting_lab', 'passage_history', 'location', 'collection_date' ] )
required_fields = { 'sequence' : { 'strain', 'date', 'sequence_name', 'source', 'locus', 'sequence', 'isolate_id' } }
optional_fields = { 'sequence': { 'strain', 'date', 'passage_category', 'source', 'submitting_lab',
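For context, the fasta_headers mapping defines the expected order of fields in each source's fasta description line; the parsing code itself isn't part of this hunk, but the intent is roughly the sketch below (the '|' delimiter and the example header are assumptions, not taken from this commit):

import cfg
header = "EPI123456|A/Texas/50/2012|s123|HA|passage_details|some_lab"  # hypothetical gisaid-style header
fields = dict(zip(cfg.fasta_headers['gisaid'], header.split('|')))
# fields['strain_name'] -> 'A/Texas/50/2012', fields['locus'] -> 'HA', fields['sequencing_lab'] -> 'some_lab'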
@@ -1,6 +1,7 @@
import os, time, datetime, csv, sys, json
import cfg
from Bio import SeqIO
from pdb import set_trace
sys.path.append('')
# sys.path.append('src/')
# import cleaning_functions as c
@@ -0,0 +1,62 @@
from __future__ import print_function
import os, csv, sys, json, re
import logging
from pdb import set_trace
from Bio import Entrez
from Bio import SeqIO
def query_genbank(accessions, parsers, email=None, retmax=10, n_entrez=10, gbdb="nuccore", **kwargs):
# https://www.biostars.org/p/66921/
logger = logging.getLogger(__name__)
if not email:
email = os.environ['NCBI_EMAIL']
Entrez.email = email
logger.debug("Genbank email: {}".format(email))
# prepare queries for download in chunks no greater than n_entrez
queries = []
for i in sorted(xrange(0, len(accessions), n_entrez)):
queries.append(set(accessions[i:i+n_entrez]))
def esearch(accs):
if len(accs) == 0:
logger.debug("No accessions left to search")
return
logger.debug("esearch with {}".format(accs))
list_accs = list(accs)
res = Entrez.read(Entrez.esearch(db=gbdb, term=" ".join(list_accs), retmax=retmax))
if "ErrorList" in res:
not_found = res["ErrorList"]["PhraseNotFound"][0]
logger.debug("esearch failed - accession {} doesn't exist. Retrying".format(not_found))
accs.remove(not_found)
esearch(accs)
else: # success :)
for i, x in enumerate(list_accs):
acc_gi_map[x] = res["IdList"][i]
# populate Accession -> GI number via entrez esearch
acc_gi_map = {x:None for x in accessions}
for qq in queries:
esearch(qq)
gi_numbers = [x for x in acc_gi_map.values() if x != None]
# get entrez data via efetch
logger.debug("Getting epost results for {}".format(gi_numbers))
try:
search_handle = Entrez.epost(db=gbdb, id=",".join(gi_numbers))
search_results = Entrez.read(search_handle)
webenv, query_key = search_results["WebEnv"], search_results["QueryKey"]
except:
logger.critical("ERROR: Couldn't connect with entrez, please run again")
sys.exit(2)
for start in range(0, len(gi_numbers), retmax):
#fetch entries in batch
try:
handle = Entrez.efetch(db=gbdb, rettype="gb", retstart=start, retmax=retmax, webenv=webenv, query_key=query_key)
except IOError:
logger.critical("ERROR: Couldn't connect with entrez, please run again")
else:
SeqIO_records = SeqIO.parse(handle, "genbank")
for record in SeqIO_records:
for parser in parsers:
parser(record, **kwargs)
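A minimal way to exercise query_genbank on its own is with a throwaway parser that just prints each record id (the accessions and email below are placeholder values; when no email is passed the function falls back to the NCBI_EMAIL environment variable):

from genbank_API import query_genbank

def print_id(record, **kwargs):
    # query_genbank hands every fetched SeqRecord to each parser as parser(record, **kwargs)
    print(record.id)

query_genbank(["KX369547", "KU501215"], [print_id], email="you@example.org", n_entrez=2)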
@@ -0,0 +1,32 @@
import logging
import re
def extract_attributions(data, record, **kwargs):
logger = logging.getLogger(__name__)
accession = re.match(r'^([^.]*)', record.id).group(0).upper() # get everything before the '.'?
if accession not in data:
logger.warn("Error with accession {}".format(accession))
return
references = record.annotations["references"]
if len(references):
# is there a reference which is not a "Direct Submission"?
data[accession] = {}
titles = [reference.title for reference in references]
try:
idx = [i for i, j in enumerate(titles) if j is not None and j != "Direct Submission"][0]
except IndexError: # fall back to direct submission
idx = [i for i, j in enumerate(titles) if j is not None][0]
reference = references[idx] # <class 'Bio.SeqFeature.Reference'>
keys = reference.__dict__.keys()
data[accession]['title'] = reference.title
if "authors" in keys and reference.authors is not None:
first_author = re.match(r'^([^,]*)', reference.authors).group(0)
data[accession]['authors'] = first_author + " et al"
if "journal" in keys and reference.journal is not None:
data[accession]['journal'] = reference.journal
if "pubmed_id" in keys and reference.pubmed_id is not None:
data[accession]["puburl"] = "https://www.ncbi.nlm.nih.gov/pubmed/" + reference.pubmed_id
logger.debug("{} -> {}".format(accession, data[accession]))
else:
logger.debug("{} had no references".format(accession))
@@ -1,6 +1,8 @@
import re, sys
import cfg
import csv
import logging
sys.path.append('')
# Cleaning functions that will clean the data in a dataset object.
# These are kept separate from class functions to make it easier for the user to
@@ -41,8 +43,9 @@ def fix_locus(doc, key, remove, *args):
'''
if 'locus' in doc and doc['locus'] is not None:
doc['locus'] = doc['locus'].lower()
else:
remove.append(key)
# commented out because, if the header didn't have this field, it would remove all documents!
# else:
# remove.append(key)
def fix_strain_name(doc, key, remove, *args):
'''
@@ -69,12 +72,15 @@ def fix_submitting_lab(doc, key, remove, *args):
'''
Moved to cleaning_functions/fix/submitting_lab.py
'''
logger = logging.getLogger(__name__)
if 'submitting_lab' in doc and doc['submitting_lab'] is not None:
if doc['submitting_lab'] == 'CentersforDiseaseControlandPrevention':
doc['submitting_lab'] = 'CentersForDiseaseControlAndPrevention'
doc['submitting_lab'] = camelcase_to_snakecase(doc['submitting_lab'])
else:
remove.append(key)
# commented out because, if the header didn't have this field, it would remove all documents!
# else:
# logger.warn("Dropping {} - bad submitting lab".format(doc['strain']))
# remove.append(key)
def fix_age(doc, *args):
'''
@@ -1,4 +1,6 @@
from dataset import Dataset
from genbank_API import query_genbank
from genbank_parsers import extract_attributions
import cfg as cfg
import argparse
import os, sys, time
@@ -47,11 +49,20 @@ def list_options(list_pathogens, list_datatypes):
parser.add_argument('--list_datatypes', default=False, action='store_true', help='list all supported datatypes and exit')
parser.add_argument('--permissions', default='public', help='permissions level for documents in JSON')
parser.add_argument('--test', default=False, action='store_true', help='test run for debugging') # Remove this at some point.
parser.add_argument("--debug", action="store_const", dest="loglevel", const=logging.DEBUG, help="Enable debugging logging")
parser.add_argument("--update_attributions_via_genbank", action="store_true", default=False, help="Use the genbank API to fill in attributions for accession numbers")
## there will be heaps of arguments here (about 15 just for genbank API) - we should look into argument grouping
args = parser.parse_args()
list_options(args.list_pathogens, args.list_datatypes)
assert_valid_input(**args.__dict__)
## set up logger - it can now be used anywhere simply via
## logger = logging.getLogger(__name__)
if not args.loglevel: args.loglevel = logging.INFO
logging.basicConfig(level=args.loglevel, format='%(asctime)-15s %(message)s')
logger = logging.getLogger(__name__)
if args.test:
D = Dataset(**args.__dict__)
# TODO: Add abstraction layer to read_data_files()
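Because logging.basicConfig is called once at this entry point, the other modules in this commit pick up the same handler and level simply by asking for a named logger, which is exactly what genbank_API and genbank_parsers already do:

import logging
logger = logging.getLogger(__name__)
logger.debug("only emitted when the entry point was run with --debug")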
