Merge pull request #1268: filter: Try converting all queried columns to numerical type
nextstrain · Jul 31, 2023 · a35f7a6 · a35f7a6
2 parents 632585c + ce756c3
commit a35f7a6
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 6 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -15,11 +15,13 @@
 * export v1: Added a deprecation warning for this command. [#1265][] (@victorlin)
 * export v1: The recently introduced flag `--metadata-id-columns` did not work properly due to the same `export v2` bug that was fixed in this release. Instead of fixing it in `export v1`, drop the broken feature since this command is no longer being maintained. [#1265][] (@victorlin)
 * filter: Expose internal Pandas errors from `--query` which may be useful to users. [#1267][] (@victorlin)
+* filter: Previously, `--query` would fail when numerical comparisons were used on columns with missing values. This has been fixed. [#1269][] (@victorlin)
 
 [#1260]: https://github.com/nextstrain/augur/issues/1260
 [#1262]: https://github.com/nextstrain/augur/issues/1262
 [#1265]: https://github.com/nextstrain/augur/pull/1265
 [#1267]: https://github.com/nextstrain/augur/pull/1267
+[#1269]: https://github.com/nextstrain/augur/issues/1269
 
 ## 22.1.0 (10 July 2023)
 

diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py
@@ -13,6 +13,18 @@
 from augur.utils import read_strains
 from . import constants
 
+try:
+    # python ≥3.8 only
+    from typing import Literal  # type: ignore
+except ImportError:
+    from typing_extensions import Literal  # type: ignore
+
+try:
+    # pandas ≥1.5.0 only
+    PandasUndefinedVariableError = pd.errors.UndefinedVariableError  # type: ignore
+except AttributeError:
+    PandasUndefinedVariableError = pd.core.computation.ops.UndefinedVariableError  # type: ignore
+
 
 # The strains to keep as a result of applying a filter function.
 FilterFunctionReturn = Set[str]
@@ -178,6 +190,10 @@ def filter_by_query(metadata, query) -> FilterFunctionReturn:
     set()
 
     """
+    # Try converting all queried columns to numeric so that numerical comparisons work; columns that cannot be converted are left unchanged.
+    for column in extract_variables(query).intersection(metadata.columns):
+        metadata[column] = pd.to_numeric(metadata[column], errors='ignore')
+
     return set(metadata.query(query).index.values)
 
 
@@ -711,12 +727,7 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
             )
         except Exception as e:
             if filter_function is filter_by_query:
-                try:
-                    # pandas ≥1.5.0 only
-                    UndefinedVariableError = pd.errors.UndefinedVariableError  # type: ignore
-                except AttributeError:
-                    UndefinedVariableError = pd.core.computation.ops.UndefinedVariableError  # type: ignore
-                if isinstance(e, UndefinedVariableError):
+                if isinstance(e, PandasUndefinedVariableError):
                     raise AugurError(f"Query contains a column that does not exist in metadata.") from e
                 raise AugurError(f"Internal Pandas error when applying query:\n\t{e}\nEnsure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.") from e
             else:
@@ -799,3 +810,29 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
         kwarg_list.append((key, value))
 
     return json.dumps(kwarg_list)
+
+
+# From https://stackoverflow.com/a/76536356
+def extract_variables(pandas_query: str):
+    """Extract variable names used in a pandas query string."""
+
+    # Track variables in a dictionary to be used as a dictionary of globals.
+    variables: Dict[str, Literal[None]] = {}
+
+    while True:
+        try:
+            # Try creating an Expr object with the query string and dictionary of globals.
+            # This will raise an error as long as the dictionary of globals is incomplete.
+            env = pd.core.computation.scope.ensure_scope(level=0, global_dict=variables)
+            pd.core.computation.expr.Expr(pandas_query, env=env)
+
+            # Exit the loop when evaluation is successful.
+            break
+        except PandasUndefinedVariableError as e:
+            # This relies on the format defined here: https://github.com/pandas-dev/pandas/blob/965ceca9fd796940050d6fc817707bba1c4f9bff/pandas/errors/__init__.py#L401
+            name = re.findall("name '(.+?)' is not defined", str(e))[0]
+
+            # Add the name to the globals dictionary with a dummy value.
+            variables[name] = None
+
+    return set(variables.keys())
diff --git a/setup.py b/setup.py
@@ -87,6 +87,7 @@
             "sphinx-autodoc-typehints >=1.21.4",
             "types-jsonschema >=3.0.0, ==3.*",
             "types-setuptools",
+            "typing_extensions; python_version <'3.8'",
             "wheel >=0.32.3",
             "ipdb >=0.10.1"
         ]

diff --git a/tests/functional/filter/cram/filter-query-numerical.t b/tests/functional/filter/cram/filter-query-numerical.t
@@ -0,0 +1,24 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Create metadata file for testing.
+
+  $ cat >metadata.tsv <<~~
+  > strain	coverage
+  > SEQ_1	0.94
+  > SEQ_2	0.95
+  > SEQ_3	0.96
+  > SEQ_4	
+  > ~~
+
+The 'coverage' column should be query-able by numerical comparisons.
+
+  $ ${AUGUR} filter \
+  >  --metadata metadata.tsv \
+  >  --query "coverage >= 0.95" \
+  >  --output-strains filtered_strains.txt > /dev/null
+
+  $ sort filtered_strains.txt
+  SEQ_2
+  SEQ_3