nextstrain · victorlin · Feb 12, 2024 · Feb 9, 2024 · Feb 9, 2024 · Feb 9, 2024
diff --git a/CHANGES.md b/CHANGES.md
@@ -9,6 +9,7 @@
 
 ### Bug Fixes
 
+* filter: In version 24.1.0, automatic conversion of boolean columns was accidentally removed. It has been restored with additional support for empty values evaluated as `None`. [#1410][] (@victorlin)
 * filter: The order of rows in `--output-metadata` and `--output-strains` now reflects the order in the original `--metadata`. [#1294][] (@victorlin)
 * filter, frequencies, refine: Performance improvements to reading the input metadata file. [#1294][] (@victorlin)
     * For filter, this comes with increased writing times for `--output-metadata` and `--output-strains`. However, net I/O speed still decreased during testing of this change.
@@ -18,6 +19,7 @@
 
 [#1294]: https://github.com/nextstrain/augur/pull/1294
 [#1389]: https://github.com/nextstrain/augur/pull/1389
+[#1410]: https://github.com/nextstrain/augur/pull/1410
 
 ## 24.1.0 (30 January 2024)
 

diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
@@ -201,19 +201,27 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D
         # Column extraction failed. Apply type conversion to all columns.
         columns = metadata_copy.columns
 
-    # If a type is not explicitly provided, try converting the column to numeric.
-    # This should cover most use cases, since one common problem is that the
-    # built-in data type inference when loading the DataFrame does not
-    # support nullable numeric columns, so numeric comparisons won't work on
-    # those columns. pd.to_numeric does proper conversion on those columns,
-    # and will not make any changes to columns with other values.
+    # If a type is not explicitly provided, try automatic conversion.
     for column in columns:
-        column_types.setdefault(column, 'numeric')
+        column_types.setdefault(column, 'auto')
 
     # Convert data types before applying the query.
+    # NOTE: This can behave differently between different chunks of metadata,
+    # but it's the best we can do.
     for column, dtype in column_types.items():
-        if dtype == 'numeric':
-            metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
+        if dtype == 'auto':
+            # Try numeric conversion followed by boolean conversion.
+            try:
+                # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's
+                # built-in data type inference.
+                metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise')
+            except:
+                try:
+                    metadata_copy[column] = metadata_copy[column].map(_string_to_boolean)
+                except ValueError:
+                    # If both conversions fail, column values are preserved as strings.
+                    pass
+
         elif dtype == 'int':
             try:
                 metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='integer')
@@ -224,6 +232,11 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D
                 metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='float')
             except ValueError as e:
                 raise AugurError(f"Failed to convert value in column {column!r} to float. {e}")
+        elif dtype == 'bool':
+            try:
+                metadata_copy[column] = metadata_copy[column].map(_string_to_boolean)
+            except ValueError as e:
+                raise AugurError(f"Failed to convert value in column {column!r} to bool. {e}")
         elif dtype == 'str':
             metadata_copy[column] = metadata_copy[column].astype('str', errors='ignore')
 
@@ -235,6 +248,21 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D
         raise AugurError(f"Internal Pandas error when applying query:\n\t{e}\nEnsure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.") from e
 
 
+def _string_to_boolean(s: str):
+    """Convert a string to an optional boolean value.
+
+    Raises ValueError if it cannot be converted.
+    """
+    if s.lower() == 'true':
+        return True
+    elif s.lower() == 'false':
+        return False
+    elif s == '':
+        return None
+
+    raise ValueError(f"Unable to convert {s!r} to a boolean value.")
+
+
 def filter_by_ambiguous_date(metadata, date_column, ambiguity) -> FilterFunctionReturn:
     """Filter metadata in the given pandas DataFrame where values in the given date
     column have a given level of ambiguity.

diff --git a/augur/filter/io.py b/augur/filter/io.py
@@ -128,7 +128,7 @@ def write_metadata_based_outputs(input_metadata_path: str, delimiters: Sequence[
 
 
 # These are the types accepted in the following function.
-ACCEPTED_TYPES = {'numeric', 'int', 'float', 'str'}
+ACCEPTED_TYPES = {'int', 'float', 'bool', 'str'}
 
 def column_type_pair(input: str):
     """Get a 2-tuple for column name to type.

diff --git a/tests/functional/filter/cram/filter-query-boolean.t b/tests/functional/filter/cram/filter-query-boolean.t
@@ -0,0 +1,78 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+A column with True and False values is query-able by boolean comparisons.
+
+  $ cat >metadata.tsv <<~~
+  > strain	column
+  > SEQ_1	True
+  > SEQ_2	True
+  > SEQ_3	False
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "column == True" \
+  >  --output-strains filtered_strains.txt
+  1 strain was dropped during filtering
+  	1 was filtered out by the query: "column == True"
+  2 strains passed all filters
+
+  $ sort filtered_strains.txt
+  SEQ_1
+  SEQ_2
+
+Note that the string value is case-insensitive.
+
+  $ cat >metadata.tsv <<~~
+  > strain	column
+  > SEQ_1	True
+  > SEQ_2	trUe
+  > SEQ_3	FALSE
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "column == True" \
+  >  --output-strains filtered_strains.txt
+  1 strain was dropped during filtering
+  	1 was filtered out by the query: "column == True"
+  2 strains passed all filters
+
+Note that 1/0 can also be compared to boolean literals.
+
+  $ cat >metadata.tsv <<~~
+  > strain	column
+  > SEQ_1	1
+  > SEQ_2	1
+  > SEQ_3	0
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "column == True" \
+  >  --output-strains filtered_strains.txt
+  1 strain was dropped during filtering
+  	1 was filtered out by the query: "column == True"
+  2 strains passed all filters
+
+Empty values are ignored.
+
+  $ cat >metadata.tsv <<~~
+  > strain	column
+  > SEQ_1	True
+  > SEQ_2	False
+  > SEQ_3	
+  > ~~
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "column == True" \
+  >  --output-strains filtered_strains.txt
+  2 strains were dropped during filtering
+  	2 were filtered out by the query: "column == True"
+  1 strain passed all filters
+
+  $ sort filtered_strains.txt
+  SEQ_1
diff --git a/tests/functional/filter/cram/filter-query-columns.t b/tests/functional/filter/cram/filter-query-columns.t
@@ -53,3 +53,13 @@ Specifying category:float does not work.
   >  --output-strains filtered_strains.txt
   ERROR: Failed to convert value in column 'category' to float. Unable to parse string "A" at position 0
   [2]
+
+Specifying category:bool also does not work.
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "coverage >= 0.95 & category == 'B'" \
+  >  --query-columns category:bool \
+  >  --output-strains filtered_strains.txt
+  ERROR: Failed to convert value in column 'category' to bool. Unable to convert 'A' to a boolean value.
+  [2]