Merge pull request #1241: Minor improvements

nextstrain · Jun 23, 2023 · 4fbb5df · 4fbb5df
2 parents cfb5255 + 3f8a284
commit 4fbb5df
Show file tree

Hide file tree

Showing 7 changed files with 104 additions and 5 deletions.
diff --git a/augur/dates/__init__.py b/augur/dates/__init__.py
@@ -120,8 +120,6 @@ def get_numerical_date_from_value(value, fmt=None, min_max_year=None):
             ambig_date = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year)
         except InvalidDate as error:
             raise AugurError(str(error)) from error
-        if ambig_date is None or None in ambig_date:
-            return [None, None] #don't send to numeric_date or will be set to today
         return [treetime.utils.numeric_date(d) for d in ambig_date]
     try:
         return treetime.utils.numeric_date(datetime.datetime.strptime(value, fmt))

diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -71,7 +71,6 @@ def run(args):
         if build_sequence_index:
             os.unlink(sequence_index_path)
 
-        # Calculate summary statistics needed for filtering.
         sequence_strains = set(sequence_index.index.values)
 
     #####################################

diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py
@@ -23,7 +23,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
     metadata : pandas.DataFrame
         Metadata to inspect for the given strains.
     group_by : list
-        A list of metadata (or calculated) columns to group records by.
+        A list of metadata (or generated) columns to group records by.
 
     Returns
     -------
@@ -39,7 +39,7 @@ def get_groups_for_subsampling(strains, metadata, group_by=None):
     >>> group_by_strain
     {'strain1': ('Africa',), 'strain2': ('Europe',)}
 
-    If we group by year or month, these groups are calculated from the date
+    If we group by year or month, these groups are generated from the date
     string.
 
     >>> group_by = ["year", "month"]

diff --git a/tests/functional/filter/cram/filter-output-metadata-header.t b/tests/functional/filter/cram/filter-output-metadata-header.t
@@ -0,0 +1,48 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Since Pandas's read_csv() and to_csv() are used with a double-quote character as
+the default quotechar, any column names with that character may be altered.
+
+Quoted columns containing the tab delimiter are left unchanged.
+
+  $ cat >metadata.tsv <<~~
+  > strain	"col	1"
+  > SEQ_1	a
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --output-metadata filtered_metadata.tsv > /dev/null
+
+  $ head -n 1 filtered_metadata.tsv
+  strain	"col	1"
+
+Quoted columns without the tab delimiter are stripped of the quotes.
+
+  $ cat >metadata.tsv <<~~
+  > strain	"col1"
+  > SEQ_1	a
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --output-metadata filtered_metadata.tsv > /dev/null
+
+  $ head -n 1 filtered_metadata.tsv
+  strain	col1
+
+Any other columns with quotes are quoted, and pre-existing quotes are escaped by doubling up.
+
+  $ cat >metadata.tsv <<~~
+  > strain	col"1	col2"
+  > SEQ_1	a	b
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --output-metadata filtered_metadata.tsv > /dev/null
+
+  $ head -n 1 filtered_metadata.tsv
+  strain	"col""1"	"col2"""
diff --git a/tests/functional/filter/cram/filter-sequences-vcf.t b/tests/functional/filter/cram/filter-sequences-vcf.t
@@ -8,7 +8,11 @@ Filter TB strains from VCF and save as a list of filtered strains.
   >  --sequences "$TESTDIR/../data/tb.vcf.gz" \
   >  --metadata "$TESTDIR/../data/tb_metadata.tsv" \
   >  --min-date 2012 \
+  >  --output filtered.vcf \
   >  --output-strains filtered_strains.txt > /dev/null
   Note: You did not provide a sequence index, so Augur will generate one. You can generate your own index ahead of time with `augur index` and pass it with `augur filter --sequence-index`.
   $ wc -l filtered_strains.txt
   \s*3 .* (re)
+
+  $ wc -l filtered.vcf
+  \s*2314 .* (re)
diff --git a/tests/functional/filter/cram/subsample-group-by-empty-value.t b/tests/functional/filter/cram/subsample-group-by-empty-value.t
@@ -0,0 +1,31 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+  $ cat >metadata-no-date.tsv <<~~
+  > strain	col1	col2	col3
+  > SEQ1			b
+  > SEQ2			b
+  > SEQ3		c	d
+  > SEQ4		c	d
+  > ~~
+
+An empty value in a --group-by column is still treated as a value for grouping.
+
+I.e. the groups here are:
+1. (None, None, b)
+2. (None, c   , d)
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata-no-date.tsv \
+  >   --group-by col1 col2 col3 \
+  >   --sequences-per-group 1 \
+  >   --subsample-seed 0 \
+  >   --output-log filtered-log.tsv \
+  >   --output-strains filtered-strains.txt > /dev/null
+  $ cat filtered-strains.txt
+  SEQ1
+  SEQ3
+  $ tail -n+2 filtered-log.tsv | sort
+  SEQ2\tsubsampling\t (esc)
+  SEQ4\tsubsampling\t (esc)
diff --git a/tests/functional/filter/cram/subsample-priority-file-error.t b/tests/functional/filter/cram/subsample-priority-file-error.t
@@ -30,3 +30,22 @@ Try running with the above file.
   >  --output-strains filtered_strains.txt > /dev/null
   ERROR: missing or malformed priority scores file priorities.csv
   [2]
+
+Create a priority file that contains a non-numeric priority score.
+
+  $ cat >priorities.csv <<~~
+  > SEQ_1	5
+  > SEQ_2	6
+  > SEQ_3	8
+  > SEQ_4	a
+  > ~~
+
+Try running with the above file.
+
+  $ ${AUGUR} filter \
+  >  --metadata "$TESTDIR/../data/metadata.tsv" \
+  >  --priority priorities.csv \
+  >  --subsample-max-sequences 5 \
+  >  --output-strains filtered_strains.txt > /dev/null
+  ERROR: missing or malformed priority scores file priorities.csv
+  [2]