Skip to content

Commit

Permalink
Merge pull request #1268: filter: Try converting all queried columns …
Browse files Browse the repository at this point in the history
…to numerical type
  • Loading branch information
victorlin committed Jul 31, 2023
2 parents 632585c + ce756c3 commit a35f7a6
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
* export v1: Added a deprecation warning for this command. [#1265][] (@victorlin)
* export v1: The recently introduced flag `--metadata-id-columns` did not work properly due to the same `export v2` bug that was fixed in this release. Instead of fixing it in `export v1`, drop the broken feature since this command is no longer being maintained. [#1265][] (@victorlin)
* filter: Expose internal Pandas errors from `--query` which may be useful to users. [#1267][] (@victorlin)
* filter: Previously, `--query` would fail when numerical comparisons were used on columns with missing values. This has been fixed. [#1269][] (@victorlin)

[#1260]: https://github.com/nextstrain/augur/issues/1260
[#1262]: https://github.com/nextstrain/augur/issues/1262
[#1265]: https://github.com/nextstrain/augur/pull/1265
[#1267]: https://github.com/nextstrain/augur/pull/1267
[#1269]: https://github.com/nextstrain/augur/issues/1269

## 22.1.0 (10 July 2023)

Expand Down
49 changes: 43 additions & 6 deletions augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@
from augur.utils import read_strains
from . import constants

try:
# python ≥3.8 only
from typing import Literal # type: ignore
except ImportError:
from typing_extensions import Literal # type: ignore

try:
# pandas ≥1.5.0 only
PandasUndefinedVariableError = pd.errors.UndefinedVariableError # type: ignore
except AttributeError:
PandasUndefinedVariableError = pd.core.computation.ops.UndefinedVariableError # type: ignore


# The strains to keep as a result of applying a filter function.
FilterFunctionReturn = Set[str]
Expand Down Expand Up @@ -178,6 +190,10 @@ def filter_by_query(metadata, query) -> FilterFunctionReturn:
set()
"""
# Try converting all queried columns to numeric.
for column in extract_variables(query).intersection(metadata.columns):
metadata[column] = pd.to_numeric(metadata[column], errors='ignore')

return set(metadata.query(query).index.values)


Expand Down Expand Up @@ -711,12 +727,7 @@ def apply_filters(metadata, exclude_by: List[FilterOption], include_by: List[Fil
)
except Exception as e:
if filter_function is filter_by_query:
try:
# pandas ≥1.5.0 only
UndefinedVariableError = pd.errors.UndefinedVariableError # type: ignore
except AttributeError:
UndefinedVariableError = pd.core.computation.ops.UndefinedVariableError # type: ignore
if isinstance(e, UndefinedVariableError):
if isinstance(e, PandasUndefinedVariableError):
raise AugurError(f"Query contains a column that does not exist in metadata.") from e
raise AugurError(f"Internal Pandas error when applying query:\n\t{e}\nEnsure the syntax is valid per <https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-query>.") from e
else:
Expand Down Expand Up @@ -799,3 +810,29 @@ def _filter_kwargs_to_str(kwargs: FilterFunctionKwargs):
kwarg_list.append((key, value))

return json.dumps(kwarg_list)


# From https://stackoverflow.com/a/76536356
def extract_variables(pandas_query: str):
"""Extract variable names used in a pandas query string."""

# Track variables in a dictionary to be used as a dictionary of globals.
variables: Dict[str, Literal[None]] = {}

while True:
try:
# Try creating a Expr object with the query string and dictionary of globals.
# This will raise an error as long as the dictionary of globals is incomplete.
env = pd.core.computation.scope.ensure_scope(level=0, global_dict=variables)
pd.core.computation.expr.Expr(pandas_query, env=env)

# Exit the loop when evaluation is successful.
break
except PandasUndefinedVariableError as e:
# This relies on the format defined here: https://github.com/pandas-dev/pandas/blob/965ceca9fd796940050d6fc817707bba1c4f9bff/pandas/errors/__init__.py#L401
name = re.findall("name '(.+?)' is not defined", str(e))[0]

# Add the name to the globals dictionary with a dummy value.
variables[name] = None

return set(variables.keys())
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
"sphinx-autodoc-typehints >=1.21.4",
"types-jsonschema >=3.0.0, ==3.*",
"types-setuptools",
"typing_extensions; python_version <'3.8'",
"wheel >=0.32.3",
"ipdb >=0.10.1"
]
Expand Down
24 changes: 24 additions & 0 deletions tests/functional/filter/cram/filter-query-numerical.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Setup

$ source "$TESTDIR"/_setup.sh

Create metadata file for testing.

$ cat >metadata.tsv <<~~
> strain coverage
> SEQ_1 0.94
> SEQ_2 0.95
> SEQ_3 0.96
> SEQ_4
> ~~
The 'coverage' column should be query-able by numerical comparisons.
$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --query "coverage >= 0.95" \
> --output-strains filtered_strains.txt > /dev/null
$ sort filtered_strains.txt
SEQ_2
SEQ_3

0 comments on commit a35f7a6

Please sign in to comment.