Restore automatic boolean conversion

Boolean conversion was not considered when automatic nullable numeric conversion was applied¹, but it kept working because augur filter still relied on pandas.read_csv's automatic type inference until that inference was disabled in favor of reading all columns as strings². A note to include boolean conversion was added³ but inadvertently removed in another large change⁴.

¹ b325b97: Try converting all columns to numerical type
² 9f9be3a: Read all metadata as string type
³ 725e1b4: Expand comment on numeric conversion
⁴ b0a0d11: Add --query-columns option
nextstrain · Feb 9, 2024 · a3fada4 · a3fada4
1 parent 324e120
commit a3fada4
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 12 deletions.
diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
@@ -201,18 +201,31 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D
         # Column extraction failed. Apply type conversion to all columns.
         columns = metadata_copy.columns
 
-    # If a type is not explicitly provided, try converting the column to numeric.
-    # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's
-    # built-in data type inference.
+    # If a type is not explicitly provided, try automatic conversion.
     for column in columns:
-        column_types.setdefault(column, 'numeric')
+        column_types.setdefault(column, 'auto')
 
     # Convert data types before applying the query.
     # NOTE: This can behave differently between different chunks of metadata,
     # but it's the best we can do.
     for column, dtype in column_types.items():
-        if dtype == 'numeric':
-            metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
+        if dtype == 'auto':
+            # Try numeric conversion followed by boolean conversion.
+            try:
+                # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's
+                # built-in data type inference.
+                metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise')
+            except:
+                boolean_map = {
+                    'True': True,
+                    'False': False,
+                }
+                # Try boolean conversion only when all column values are 'True'/'False'.
+                if set(metadata_copy[column].unique()) == set(boolean_map.keys()):
+                    metadata_copy[column] = metadata_copy[column].map(boolean_map)
+
+                # If both conversions fail, the column remains string type.
+
         elif dtype == 'int':
             try:
                 metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='integer')

diff --git a/tests/functional/filter/cram/filter-query-boolean.t b/tests/functional/filter/cram/filter-query-boolean.t
@@ -11,16 +11,16 @@ Create metadata file for testing.
   > SEQ_3	False
   > ~~
 
-Ideally, the column should be query-able by boolean comparisons.
-This does not currently work because all dtypes are strings.
+The column is query-able by boolean comparisons.
 
   $ ${AUGUR} filter \
   >  --metadata metadata.tsv \
   >  --query "column == True" \
   >  --output-strains filtered_strains.txt
-  ERROR: All samples have been dropped! Check filter rules and metadata file format.
-  3 strains were dropped during filtering
-  \t3 were filtered out by the query: "column == True" (esc)
-  [2]
+  1 strain was dropped during filtering
+  	1 was filtered out by the query: "column == True"
+  2 strains passed all filters
 
   $ sort filtered_strains.txt
+  SEQ_1
+  SEQ_2