Merge pull request #1418: filter: Read all metadata columns when quer…

…y parsing fails
nextstrain · Feb 16, 2024 · 71015ed · 71015ed
2 parents 4c56d09 + 4e4f0d2
commit 71015ed
Show file tree

Hide file tree

Showing 4 changed files with 59 additions and 2 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,11 @@
 
 ## __NEXT__
 
+### Bug Fixes
+
+* filter: In versions 24.2.0 and 24.2.1, `--query` stopped working in cases where internal optimizations added in version 24.2.0 failed to parse the columns from the query. It now falls back to non-optimized behavior that allows queries to work. [#1418][] (@victorlin)
+
+[#1418]: https://github.com/nextstrain/augur/pull/1418
 
 ## 24.2.1 (14 February 2024)
 

diff --git a/augur/filter/io.py b/augur/filter/io.py
@@ -3,6 +3,7 @@
 from argparse import Namespace
 import os
 import re
+from textwrap import dedent
 from typing import Sequence, Set
 import numpy as np
 from collections import defaultdict
@@ -65,7 +66,16 @@ def get_useful_metadata_columns(args: Namespace, id_column: str, all_columns: Se
         # Attempt to automatically extract columns from the query.
         variables = extract_variables(args.query)
         if variables is None and not args.query_columns:
-            raise AugurError("Could not infer columns from the pandas query. If the query is valid, please specify columns using --query-columns.")
+            print_err(dedent(f"""\
+                WARNING: Could not infer columns from the pandas query. Reading all metadata columns,
+                which may impact execution time. If the query is valid, please open a new issue:
+
+                    <https://github.com/nextstrain/augur/issues/new/choose>
+
+                and add the query to the description:
+
+                    {args.query}"""))
+            columns.update(all_columns)
         else:
             columns.update(variables)
 

diff --git a/tests/functional/filter/cram/filter-query-backtick-quoting.t b/tests/functional/filter/cram/filter-query-backtick-quoting.t
@@ -0,0 +1,32 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Create metadata file for testing.
+
+  $ cat >metadata.tsv <<~~
+  > strain	region name
+  > SEQ_1	A
+  > SEQ_2	A
+  > SEQ_3	B
+  > SEQ_4	
+  > ~~
+
+The 'region name' column is query-able by backtick quoting.
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query '(`region name` == "A")' \
+  >  --output-strains filtered_strains.txt > /dev/null
+  WARNING: Could not infer columns from the pandas query. Reading all metadata columns,
+  which may impact execution time. If the query is valid, please open a new issue:
+  
+      <https://github.com/nextstrain/augur/issues/new/choose>
+  
+  and add the query to the description:
+  
+      (`region name` == "A")
+
+  $ sort filtered_strains.txt
+  SEQ_1
+  SEQ_2
diff --git a/tests/functional/filter/cram/filter-query-errors.t b/tests/functional/filter/cram/filter-query-errors.t
@@ -41,5 +41,15 @@ However, other Pandas errors are not so helpful, so a link is provided for users
   >  --metadata "$TESTDIR/../data/metadata.tsv" \
   >  --query "some bad syntax" \
   >  --output-strains filtered_strains.txt > /dev/null
-  ERROR: Could not infer columns from the pandas query. If the query is valid, please specify columns using --query-columns.
+  WARNING: Could not infer columns from the pandas query. Reading all metadata columns,
+  which may impact execution time. If the query is valid, please open a new issue:
+
+      <https://github.com/nextstrain/augur/issues/new/choose>
+
+  and add the query to the description:
+
+      some bad syntax
+  ERROR: Internal Pandas error when applying query:
+  	invalid syntax (<unknown>, line 1)
+  Ensure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.
   [2]