Permalink
Browse files

Remove pathogen table and unnecessary print statements.

  • Loading branch information...
barneypotter24 committed Nov 17, 2017
1 parent 97d81f6 commit 59a209222dd8564b1fcceae978a3ae7ee6eb5722
Showing with 4 additions and 69 deletions.
  1. +4 −68 src/dataset.py
  2. +0 −1 src/run.py
View
@@ -173,13 +173,8 @@ def clean(self, doc):
# Use functions specified by cfg.py. Fxn defs in cleaning_functions.py
fxns = cfg.sequence_clean
print "Doc:"
print doc
for fxn in fxns:
fxn(doc, None, self.bad_docs, self.metadata['pathogen'])
print str(fxn)
print doc
return doc
@@ -220,6 +215,10 @@ def reshape(self,docs):
##################################################
# Everything south of here should be considered deprecated until
# it has been looked over and updated relative to the new JSON spec.
def read_metadata(self, path, metafile, **kwargs):
'''
Read an xml file to a metadata dataset
@@ -278,69 +277,6 @@ def set_sequence_permissions(self, permissions, **kwargs):
for a in self.dataset:
self.dataset[a]['permissions'] = permissions
def compile_pathogen_table(self, subtype, **kwargs):
    '''
    Build a per-strain summary table from self.dataset and store it
    on self.pathogens.

    Each document in self.dataset must provide 'strain' and
    'sequence_name'. For every strain the table records:
      - 'strain': the strain name
      - 'sequence_names': sequence names of all documents for the strain
      - 'host': the document's host, or 'human' if missing or None
      - 'host_age': the document's age, or None if missing
      - 'subtype': the subtype argument if given, else the document's
        own subtype, else None
      - 'number_of_segments': count of distinct 'locus' values
      - 'un_locode': placeholder string

    Side effects: 'host' and 'age' are removed from each document, and
    'subtype' is removed when the document's own value was used.

    subtype: subtype to apply to every strain, or None to use the
        per-document value.
    kwargs: accepted and ignored.
    '''
    vs = {}
    for doc in self.dataset.values():
        name = doc['strain']
        entry = vs.setdefault(name, {'strain': name, 'sequence_names': []})
        entry['sequence_names'].append(doc['sequence_name'])

        # Scrape pathogen host. The value is read from the document;
        # indexing the strain name string here would raise TypeError.
        # TODO: Resolve issues if there are different hosts; currently
        # the last document seen for a strain wins.
        host = doc.pop('host', None)
        entry['host'] = 'human' if host is None else host

        # Scrape host age (a missing key and an explicit None both give None).
        # TODO: Resolve issues if there are different ages
        entry['host_age'] = doc.pop('age', None)

        # Scrape subtype; an explicit argument overrides the document
        # and leaves the document's own 'subtype' field in place.
        if subtype is not None:
            entry['subtype'] = subtype
        elif doc.get('subtype') is not None:
            entry['subtype'] = doc.pop('subtype')
        else:
            entry['subtype'] = None

    for entry in vs.values():
        # Scrape number of segments.
        # NOTE(review): this lookup assumes self.dataset is keyed by
        # sequence name -- confirm against the loader.
        segments = set(self.dataset[seq]['locus']
                       for seq in entry['sequence_names'])
        entry['number_of_segments'] = len(segments)
        # Placeholder for un_locode
        # TODO: Write a lookup_locode(location) fxn and use it here
        entry['un_locode'] = 'placehoder'
    self.pathogens = vs
def build_references_table(self):
'''
This is a placeholder function right now, it will build a reference
View
@@ -74,7 +74,6 @@ def list_options(list_pathogens, list_datatypes):
D.clean(key, D.dataset[key])
# D.remove_bad_docs()
print '~~~~~ Cleaned %s documents in %s seconds ~~~~~' % (len(D.dataset), (time.time()-t))
D.compile_pathogen_table(**args.__dict__)
D.build_references_table()
D.set_sequence_permissions(args.permissions)
D.write('%s%s_%s.json' % (args.outpath, args.pathogen, args.datatype))

0 comments on commit 59a2092

Please sign in to comment.