From a3fada448da9c4bea9f8b4ba6ff0270ce5d13fc5 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Fri, 9 Feb 2024 11:30:00 -0800 Subject: [PATCH] Restore automatic boolean conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was not considered when automatic nullable numeric conversion was applied¹, but it continued to work because augur filter continued to rely on pandas.read_csv's auto type inference up until it was disabled in favor of reading all columns as string². A note to include boolean was added³ but removed inadvertently in another big change⁴. ¹ b325b970: Try converting all columns to numerical type ² 9f9be3a1: Read all metadata as string type ³ 725e1b44: Expand comment on numeric conversion ⁴ b0a0d112: Add --query-columns option --- augur/filter/include_exclude_rules.py | 25 ++++++++++++++----- .../filter/cram/filter-query-boolean.t | 12 ++++----- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py index c324861e0..b675a0b68 100644 --- a/augur/filter/include_exclude_rules.py +++ b/augur/filter/include_exclude_rules.py @@ -201,18 +201,31 @@ def filter_by_query(metadata: pd.DataFrame, query: str, column_types: Optional[D # Column extraction failed. Apply type conversion to all columns. columns = metadata_copy.columns - # If a type is not explicitly provided, try converting the column to numeric. - # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's - # built-in data type inference. + # If a type is not explicitly provided, try automatic conversion. for column in columns: - column_types.setdefault(column, 'numeric') + column_types.setdefault(column, 'auto') # Convert data types before applying the query. # NOTE: This can behave differently between different chunks of metadata, # but it's the best we can do. 
for column, dtype in column_types.items(): - if dtype == 'numeric': - metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='ignore') + if dtype == 'auto': + # Try numeric conversion followed by boolean conversion. + try: + # pd.to_numeric supports nullable numeric columns unlike pd.read_csv's + # built-in data type inference. + metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise') + except (ValueError, TypeError): + boolean_map = { + 'True': True, + 'False': False, + } + # Try boolean conversion only when all column values are 'True'/'False'. + if set(metadata_copy[column].unique()) == set(boolean_map.keys()): + metadata_copy[column] = metadata_copy[column].map(boolean_map) + + # If both conversions fail, the column remains string type. + elif dtype == 'int': try: metadata_copy[column] = pd.to_numeric(metadata_copy[column], errors='raise', downcast='integer') diff --git a/tests/functional/filter/cram/filter-query-boolean.t b/tests/functional/filter/cram/filter-query-boolean.t index 84e599aaa..8b89ef356 100644 --- a/tests/functional/filter/cram/filter-query-boolean.t +++ b/tests/functional/filter/cram/filter-query-boolean.t @@ -11,16 +11,16 @@ Create metadata file for testing. > SEQ_3 False > ~~ -Ideally, the column should be query-able by boolean comparisons. -This does not currently work because all dtypes are strings. +The column is query-able by boolean comparisons. $ ${AUGUR} filter \ > --metadata metadata.tsv \ > --query "column == True" \ > --output-strains filtered_strains.txt - ERROR: All samples have been dropped! Check filter rules and metadata file format. - 3 strains were dropped during filtering - \t3 were filtered out by the query: "column == True" (esc) - [2] + 1 strain was dropped during filtering + 1 was filtered out by the query: "column == True" + 2 strains passed all filters $ sort filtered_strains.txt + SEQ_1 + SEQ_2