From a3fada448da9c4bea9f8b4ba6ff0270ce5d13fc5 Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Fri, 9 Feb 2024 11:30:00 -0800
Subject: [PATCH] Restore automatic boolean conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Boolean conversion was not considered when automatic nullable numeric
conversion was applied¹, but it kept working because augur filter still
relied on pandas.read_csv's automatic type inference until that was
disabled in favor of reading all columns as strings².

A note to include boolean conversion was added³ but was inadvertently
removed in another large change⁴.

¹ b325b970: Try converting all columns to numerical type
² 9f9be3a1: Read all metadata as string type
³ 725e1b44: Expand comment on numeric conversion
⁴ b0a0d112: Add --query-columns option
---
 augur/filter/include_exclude_rules.py         | 25 ++++++++++++++-----
 .../filter/cram/filter-query-boolean.t        | 12 ++++-----
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
index c324861e0..b675a0b68 100644
--- a/augur/filter/include_exclude_rules.py
+++ b/augur/filter/include_exclude_rules.py
@@ -201,18 +201,31 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D
         # Column extraction failed. Apply type conversion to all columns.
         columns = metadata_copy.columns
 
-    # If a type is not explicitly provided, try converting the column to numeric.
-    # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's
-    # built-in data type inference.
+    # If a type is not explicitly provided, try automatic conversion.
     for column in columns:
-        column_types.setdefault(column, 'numeric')
+        column_types.setdefault(column, 'auto')
 
     # Convert data types before applying the query.
     # NOTE: This can behave differently between different chunks of metadata,
     # but it's the best we can do.
     for column, dtype in column_types.items():
-        if dtype == 'numeric':
-            metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore')
+        if dtype == 'auto':
+            # Try numeric conversion followed by boolean conversion.
+            try:
+                # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's
+                # built-in data type inference.
+                metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise')
+            except:
+                boolean_map = {
+                    'True': True,
+                    'False': False,
+                }
+                # Try boolean conversion only when the column's unique values are exactly 'True' and 'False'.
+                if set(metadata_copy[column].unique()) == set(boolean_map.keys()):
+                    metadata_copy[column] = metadata_copy[column].map(boolean_map)
+
+                # If both conversions fail, the column remains string type.
+
         elif dtype == 'int':
             try:
                 metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='integer')
diff --git a/tests/functional/filter/cram/filter-query-boolean.t b/tests/functional/filter/cram/filter-query-boolean.t
index 84e599aaa..8b89ef356 100644
--- a/tests/functional/filter/cram/filter-query-boolean.t
+++ b/tests/functional/filter/cram/filter-query-boolean.t
@@ -11,16 +11,16 @@ Create metadata file for testing.
   > SEQ_3	False
   > ~~
 
-Ideally, the column should be query-able by boolean comparisons.
-This does not currently work because all dtypes are strings.
+The column is query-able by boolean comparisons.
 
   $ ${AUGUR} filter \
   >  --metadata metadata.tsv \
   >  --query "column == True" \
   >  --output-strains filtered_strains.txt
-  ERROR: All samples have been dropped! Check filter rules and metadata file format.
-  3 strains were dropped during filtering
-  \t3 were filtered out by the query: "column == True" (esc)
-  [2]
+  1 strain was dropped during filtering
+  	1 was filtered out by the query: "column == True"
+  2 strains passed all filters
 
   $ sort filtered_strains.txt
+  SEQ_1
+  SEQ_2