Permalink
Browse files

allow different fasta header format & fix resulting bugs

  • Loading branch information...
jameshadfield committed Nov 3, 2017
1 parent 7ff1b58 commit da48efa5b53babcda71cc95a91307ec0ad082b24
Showing with 42 additions and 14 deletions.
  1. +4 −2 src/cfg.py
  2. +18 −7 src/cleaning_functions.py
  3. +6 −4 src/dataset.py
  4. +14 −1 src/run.py
View
@@ -2,7 +2,7 @@
from cleaning_functions import *
### Acceptable parameters ###
viruses = [ 'seasonal_flu' ]
viruses = [ 'seasonal_flu', 'piglets' ]
subtypes = { 'seasonal_flu': [ 'h3n2', 'h1n1pdm', 'vic', 'yam' ] }
datatypes = [ 'titer', 'sequence', 'virus' ]
filetypes = [ 'fasta' ]
@@ -14,13 +14,15 @@
### Mappings used by sacra ###
# Lists sources from which different datatypes come from
sources = { 'sequence' : [ 'gisaid', 'fauna', 'vipr' ],
sources = { 'sequence' : [ 'gisaid', 'fauna', 'fauna_mumps', 'vipr' ], ## duplication of keys in fasta_headers
'titer' : [ 'crick', 'cdc' ] }
# For each sequence source, the default order of fields in the fasta header
fasta_headers = { 'gisaid' : [ 'accession', 'strain', 'isolate_id', 'locus', 'passage', 'submitting_lab' ],
'fauna' : [ 'strain', 'virus', 'accession', 'collection_date', 'region', 'country', 'division', 'location', 'passage', 'source', 'age' ],
'fauna_mumps' : [ 'strain', 'virus', 'accession', 'collection_date', 'country', 'division', 'muv_genotype', 'host', 'authors', 'publication_name', 'journal', 'attribution_url', 'accession_url' ],
'vipr': [ 'accession', 'strain', 'locus', 'date', 'host', 'country', 'subtype', 'virus' ] }
metadata_fields = set( [ 'isolate_id', 'subtype', 'submitting_lab', 'passage_history', 'location', 'collection_date' ] )
required_fields = { 'sequence' : { 'strain', 'date', 'accession', 'source', 'locus', 'sequence', 'isolate_id' } }
optional_fields = { 'sequence': { 'strain', 'date', 'passage_category', 'source', 'submitting_lab',
View
@@ -1,6 +1,8 @@
import re, sys
import cfg
import csv
import logging
sys.path.append('')
# Cleaning functions that will clean the data in a dataset object.
# These are kept separate from class functions to make it easier for the user to
@@ -22,7 +24,7 @@ def fix_accession(doc, key, remove, *args):
'''
if 'accession' in doc and doc['accession'] is not None:
doc['accession'] = doc['accession'].encode('ascii', 'replace')
doc['accession'] = doc['accession'].lower()
# doc['accession'] = doc['accession'].lower() # revisit this!
if doc['accession'].startswith('epi'):
doc['accession'] = doc['accession'][2:]
@@ -41,8 +43,9 @@ def fix_locus(doc, key, remove, *args):
'''
if 'locus' in doc and doc['locus'] is not None:
doc['locus'] = doc['locus'].lower()
else:
remove.append(key)
# commented out as if the header didn't have this it would remove all documents!
# else:
# remove.append(key)
def fix_strain(doc, key, remove, *args):
'''
@@ -69,12 +72,15 @@ def fix_submitting_lab(doc, key, remove, *args):
'''
Moved to cleaning_functions/fix/submitting_lab.py
'''
logger = logging.getLogger(__name__)
if 'submitting_lab' in doc and doc['submitting_lab'] is not None:
if doc['submitting_lab'] == 'CentersforDiseaseControlandPrevention':
doc['submitting_lab'] = 'CentersForDiseaseControlAndPrevention'
doc['submitting_lab'] = camelcase_to_snakecase(doc['submitting_lab'])
else:
remove.append(key)
# commented out as if the header didn't have this it would remove all documents!
# else:
# logger.warn("Dropping {} - bad submitting lab".format(doc['strain']))
# remove.append(key)
def fix_age(doc, *args):
'''
@@ -169,8 +175,13 @@ def camelcase_to_snakecase(name):
## Moved to cleaning_functions/fix/strain.py
def format_names(docs, virus):
fix_whole_name = define_strain_fixes(cfg.strain_fix_fname[virus])
label_to_fix = define_location_fixes(cfg.label_fix_fname[virus])
logger = logging.getLogger(__name__)
try:
fix_whole_name = define_strain_fixes(cfg.strain_fix_fname[virus])
label_to_fix = define_location_fixes(cfg.label_fix_fname[virus])
except KeyError:
logger.info("Skipping format_names as files not found")
return
for doc in docs:
# Fix this when switching docs to dict
for key in doc:
View
@@ -1,6 +1,7 @@
import os, time, datetime, csv, sys, json
import cfg
from Bio import SeqIO
from pdb import set_trace
sys.path.append('')
class Dataset:
@@ -227,7 +228,7 @@ def compile_virus_table(self, subtype, **kwargs):
vs[name]['host'] = 'human'
self.dataset[virus].pop('host',None)
else:
vs[name]['host'] = name['host']
vs[name]['host'] = self.dataset[virus]['host']
self.dataset[virus].pop('host',None)
# Scrape host age
@@ -253,9 +254,10 @@ def compile_virus_table(self, subtype, **kwargs):
for name in vs.keys():
# Scrape number of segments
segments = set()
for a in vs[name]['accessions']:
segments.add(self.dataset[a]['locus'])
vs[name]['number_of_segments'] = len(segments)
for acc in vs[name]['accessions']:
if 'locus' in self.dataset[acc]:
segments.add(self.dataset[acc]['locus'])
vs[name]['number_of_segments'] = len(segments) if len(segments) else 1
# # Scrape isolate ids
# ids = set()
View
@@ -2,6 +2,8 @@
import cfg as cfg
import argparse
import os, sys
from pdb import set_trace
import logging
sys.path.append('')
def assert_valid_input(virus, datatype, path, outpath, infiles, source, subtype, ftype, **kwargs):
@@ -36,7 +38,7 @@ def list_options(list_viruses, list_datatypes):
if __name__=="__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-v', '--virus', default='seasonal_flu', type=str, help='virus type to be processed; default is seasonal_flu')
parser.add_argument('-v', '--virus', required=True, type=str, help='virus type to be processed (required)')
parser.add_argument('-d', '--datatype', default='sequence', type=str, help='type of data being input; default is \"sequence\", other options are \"virus\" or \"titer\"')
parser.add_argument('-p', '--path', default='data/', type=str, help='path to input file(s), default is \"data/\"')
parser.add_argument('-m', '--metafile', default=None, type=str, help='name of file containing virus metadata')
@@ -49,12 +51,23 @@ def list_options(list_viruses, list_datatypes):
parser.add_argument('--list_datatypes', default=False, action='store_true', help='list all supported datatypes and exit')
parser.add_argument('--permissions', default='public', help='permissions level for documents in JSON')
parser.add_argument('--test', default=False, action='store_true', help='test run for debugging') # Remove this at some point.
parser.add_argument("--debug", action="store_const", dest="loglevel", const=logging.DEBUG, help="Enable debugging logging")
args = parser.parse_args()
list_options(args.list_viruses, args.list_datatypes)
assert_valid_input(**args.__dict__)
## set up logger - it can now be used anywhere simply via
## logger = logging.getLogger(__name__)
if not args.loglevel: args.loglevel = logging.INFO
logging.basicConfig(level=args.loglevel, format='%(asctime)-15s %(message)s')
logger = logging.getLogger(__name__)
if args.test:
D = Dataset(**args.__dict__)
D.set_sequence_permissions(args.permissions)
if (args.update_attributions_via_genbank):
set_trace()
## not sure of the best interface here...
D.write('%s%s_%s.json' % (args.outpath, args.virus, args.datatype))

0 comments on commit da48efa

Please sign in to comment.