Merge branch 'cleaning_functions'

* cleaning_functions: Update field names to match with spec and bug fixes. Update call to spec and change virus references to pathogen. Add reshape function to dataset class. Move Sacra spec to separate file. Update config with mapping of tables to fields. All fields must be unique. Move file reading/cleaning to cleaner structure. Move cleaning functions to nested directories for each field in a doc.
nextstrain · Nov 17, 2017 · 97d81f6 · 97d81f6
2 parents a33965c + 08d2bb1
commit 97d81f6
Show file tree

Hide file tree

Showing 14 changed files with 323 additions and 256 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,5 +15,5 @@ __pycache__
 # Code duplicated for python3
 2to3/*
 
-# Other
+# Note files
 notes.txt
diff --git a/README.md b/README.md
@@ -29,8 +29,8 @@ This will write a JSON to the `output` directory.
 If uploading multiple files is necessary, the call can be altered to: `python src/run.py --infiles split_file_1.fasta split_file_2.fasta --source gisaid --test`.
 
 ## Explanation of options
-- `-v`, `--virus`:
-  - Virus species that will be processed in the dataset run. To avoid errors, this should be present in `src/cfg.py`.
+- `-v`, `--pathogen`:
+  - Pathogen species that will be processed in the dataset run. To avoid errors, this should be present in `src/cfg.py`.
   - _Default:_ `seasonal_flu`
 - `-d`, `--datatype`:
   - Type of data that will be processed (e.g. sequence, titer, epi). To avoid errors, this should be present in `src/cfg.py`.
@@ -50,9 +50,9 @@ If uploading multiple files is necessary, the call can be altered to: `python sr
 - `--source`:
   - Source from which the data came. Used in `src/cfg.py` to specify parameters specific to a given source (e.g. the order of metadata in a FASTA header).
 - `--subtype`:
-  - Subtype of a given virus, if known.
--  `--list_viruses`:
-  - Lists all supported viruses and exits.
+  - Subtype of a given pathogen, if known.
+-  `--list_pathogens`:
+  - Lists all supported pathogens and exits.
 - `--list_datatypes`:
   - Lists all supported datatypes and exits.
 

diff --git a/database-ideas.md b/database-ideas.md
@@ -3,6 +3,6 @@
 ### Modification of current fauna
 
 User databases:
-  <virus>\_<datatype> tables
+  <pathogen>\_<datatype> tables
 
 each <datatype> has its own primary key/optional columns
diff --git a/src/cfg.py b/src/cfg.py
@@ -1,32 +1,32 @@
 # TODO, This needs to be reformatted to be more user friendly
-from cleaning_functions import *
+from old_cleaning_functions import *
 
 ### Acceptable parameters ###
 viruses = [ 'seasonal_flu', 'piglets' ]
 subtypes = { 'seasonal_flu': [ 'h3n2', 'h1n1pdm', 'vic', 'yam' ] }
-datatypes = [ 'titer', 'sequence', 'virus' ]
+datatypes = [ 'titer', 'sequence', 'pathogen' ]
 filetypes = [ 'fasta' ]
 
 ### Cleaning functions for different datatypes ###
 # Functions should be defined in cleaning_functions.py
-virus_clean = []
-sequence_clean = [ fix_accession, fix_sequence, fix_locus, fix_strain, remove_isolate_id, fix_passage, fix_submitting_lab, fix_age, determine_passage_category ]
+pathogen_clean = []
+sequence_clean = [ fix_sequence_name, fix_sequence, fix_locus, fix_strain_name, fix_passage, fix_submitting_lab, fix_age, determine_passage_category ]
 
 ### Mappings used by sacra ###
 # Lists the sources from which each datatype comes
 sources = { 'sequence' : [ 'gisaid', 'fauna', 'fauna_mumps', 'vipr' ], ## duplication of keys in fasta_headers
             'titer' : [ 'crick', 'cdc' ] }
+##### strain_sample from https://github.com/nextstrain/sacra/blob/schema/schema/schema_zika.json#L100
 # For each sequence source, the default order of fields in the fasta header
-fasta_headers = { 'gisaid' : [ 'accession', 'strain', 'isolate_id', 'locus', 'passage', 'submitting_lab' ],
-                  'fauna' : [ 'strain', 'virus', 'accession', 'collection_date', 'region', 'country', 'division', 'location', 'passage', 'source', 'age' ],
-                  'fauna_mumps' : [ 'strain', 'virus', 'accession', 'collection_date', 'country', 'division', 'muv_genotype', 'host', 'authors', 'publication_name', 'journal', 'attribution_url', 'accession_url' ],
-                  'vipr': [ 'accession', 'strain', 'locus', 'date', 'host', 'country', 'subtype', 'virus' ] }
+fasta_headers = { 'gisaid' : [ 'sequence_name', 'strain_name', 'sample_name', 'locus', 'passage', 'sequencing_lab' ],
+                  'fauna' : [ 'strain', 'pathogen', 'sequence_name', 'collection_date', 'region', 'country', 'division', 'location', 'passage', 'source', 'age' ],
+                  'vipr': [ 'sequence_name', 'strain', 'locus', 'date', 'host', 'country', 'subtype', 'pathogen' ] }
 
 
 metadata_fields = set( [ 'isolate_id', 'subtype', 'submitting_lab', 'passage_history', 'location', 'collection_date' ] )
-required_fields = { 'sequence' : { 'strain', 'date', 'accession', 'source', 'locus', 'sequence', 'isolate_id' } }
+required_fields = { 'sequence' : { 'strain', 'date', 'sequence_name', 'source', 'locus', 'sequence', 'isolate_id' } }
 optional_fields = { 'sequence': { 'strain', 'date', 'passage_category', 'source', 'submitting_lab',
-                                  'accession', 'host_age', 'locus', 'sequence', 'isolate_id' } }
+                                  'sequence_name', 'host_age', 'locus', 'sequence', 'isolate_id' } }
 
 
 ### Mappings used by cleaning functions ###

diff --git a/src/cleaning_functions/__init__.py b/src/cleaning_functions/__init__.py
diff --git a/src/cleaning_functions/create/__init__.py b/src/cleaning_functions/create/__init__.py
diff --git a/src/cleaning_functions/drop/__init__.py b/src/cleaning_functions/drop/__init__.py
diff --git a/src/cleaning_functions/fix/__init__.py b/src/cleaning_functions/fix/__init__.py