From 2246a78e2a615207ee208bfa4cc3339a67214035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20Smr=C5=BE?= Date: Mon, 29 Apr 2024 22:14:06 +0200 Subject: [PATCH] `pd.eval`: `Series` names are now preserved even for `"numexpr"` engine. (#58437) * Eval: Series names are preserved for numexpr Series names are now preserved even when using numexpr engine. Making the behavior consistent with python engine. * Update doc/source/whatsnew/v3.0.0.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/computation/align.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/tests/computation/test_eval.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/computation/align.py | 19 ++++++++---- pandas/core/computation/engines.py | 11 +++++-- pandas/tests/computation/test_eval.py | 43 +++++++++++++++------------ pandas/tests/frame/test_query_eval.py | 16 ++++++---- 5 files changed, 57 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a81fb584c8df9..6ae3a8e00c02f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -469,6 +469,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index c5562fb0284b7..b4e33b8ac75cb 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -160,19 +160,24 @@ def align_terms(terms): # can't iterate so it must just be a constant or single variable if isinstance(terms.value, (ABCSeries, ABCDataFrame)): typ = type(terms.value) - return typ, _zip_axes_from_type(typ, terms.value.axes) - return np.result_type(terms.type), None + name = terms.value.name if isinstance(terms.value, ABCSeries) else None + return typ, _zip_axes_from_type(typ, terms.value.axes), name + return np.result_type(terms.type), None, None # if all resolved variables are numeric scalars if all(term.is_scalar for term in terms): - return result_type_many(*(term.value for term in terms)).type, None + return result_type_many(*(term.value for term in terms)).type, None, None + + # if all input series have a common name, propagate it to the returned series + names = {term.value.name for term in terms if isinstance(term.value, ABCSeries)} + name = names.pop() if len(names) == 1 else None # perform the main alignment typ, axes = _align_core(terms) - return typ, axes + return typ, axes, name -def reconstruct_object(typ, obj, axes, dtype): +def reconstruct_object(typ, obj, axes, dtype, name): """ Reconstruct an object given its type, raw value, and possibly empty (None) axes. @@ -200,7 +205,9 @@ def reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) if not isinstance(typ, partial) and issubclass(typ, PandasObject): - return typ(obj, dtype=res_t, **axes) + if name is None: + return typ(obj, dtype=res_t, **axes) + return typ(obj, dtype=res_t, name=name, **axes) # special case for pathological things like ~True/~False if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 5db05ebe33efd..d2a181cbb3c36 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -54,6 +54,7 @@ def __init__(self, expr) -> None: self.expr = expr self.aligned_axes = None self.result_type = None + self.result_name = None def convert(self) -> str: """ @@ -76,12 +77,18 @@ def evaluate(self) -> object: The result of the passed expression. """ if not self._is_aligned: - self.result_type, self.aligned_axes = align_terms(self.expr.terms) + self.result_type, self.aligned_axes, self.result_name = align_terms( + self.expr.terms + ) # make sure no names in resolvers and locals/globals clash res = self._evaluate() return reconstruct_object( - self.result_type, res, self.aligned_axes, self.expr.terms.return_type + self.result_type, + res, + self.aligned_axes, + self.expr.terms.return_type, + self.result_name, ) @property diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index ebbb31205e264..d8e5908b0c58f 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -737,6 +737,17 @@ def test_and_logic_string_match(self): assert pd.eval(f"{event.str.match('hello').a}") assert pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}") + def test_eval_keep_name(self, engine, parser): + df = Series([2, 15, 28], name="a").to_frame() + res = df.eval("a + a", engine=engine, parser=parser) + expected = Series([4, 30, 56], name="a") + tm.assert_series_equal(expected, res) + + def test_eval_unmatching_names(self, engine, parser): + variable_name = Series([42], name="series_name") + res = pd.eval("variable_name + 0", engine=engine, parser=parser) + tm.assert_series_equal(variable_name, res) + # ------------------------------------- # gh-12388: Typecasting rules consistency with python @@ -1269,14 +1280,12 @@ def test_assignment_explicit(self): expected["c"] = expected["a"] + expected["b"] tm.assert_frame_equal(df, expected) - def test_column_in(self): + def test_column_in(self, engine): # GH 11235 df = DataFrame({"a": [11], "b": [-32]}) - result = df.eval("a in [11, -32]") - expected = Series([True]) - # TODO: 2022-01-29: Name check failed with numexpr 2.7.3 in CI - # but cannot reproduce locally - tm.assert_series_equal(result, expected, check_names=False) + result = df.eval("a in [11, -32]", engine=engine) + expected = Series([True], name="a") + tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.") def test_assignment_not_inplace(self): @@ -1505,7 +1514,7 @@ def test_date_boolean(self, engine, parser): parser=parser, ) expec = df.dates1 < "20130101" - tm.assert_series_equal(res, expec, check_names=False) + tm.assert_series_equal(res, expec) def test_simple_in_ops(self, engine, parser): if parser != "python": @@ -1620,7 +1629,7 @@ def test_unary_functions(self, fn, engine, parser): got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a) - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) @pytest.mark.parametrize("fn", _binary_math_ops) def test_binary_functions(self, fn, engine, parser): @@ -1637,7 +1646,7 @@ def test_binary_functions(self, fn, engine, parser): got = self.eval(expr, engine=engine, parser=parser) with np.errstate(all="ignore"): expect = getattr(np, fn)(a, b) - tm.assert_almost_equal(got, expect, check_names=False) + tm.assert_almost_equal(got, expect) def test_df_use_case(self, engine, parser): df = DataFrame( @@ -1653,8 +1662,8 @@ def test_df_use_case(self, engine, parser): inplace=True, ) got = df.e - expect = np.arctan2(np.sin(df.a), df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.arctan2(np.sin(df.a), df.b).rename("e") + tm.assert_series_equal(got, expect) def test_df_arithmetic_subexpression(self, engine, parser): df = DataFrame( @@ -1665,8 +1674,8 @@ def test_df_arithmetic_subexpression(self, engine, parser): ) df.eval("e = sin(a + b)", engine=engine, parser=parser, inplace=True) got = df.e - expect = np.sin(df.a + df.b) - tm.assert_series_equal(got, expect, check_names=False) + expect = np.sin(df.a + df.b).rename("e") + tm.assert_series_equal(got, expect) @pytest.mark.parametrize( "dtype, expect_dtype", @@ -1690,10 +1699,10 @@ def test_result_types(self, dtype, expect_dtype, engine, parser): assert df.a.dtype == dtype df.eval("b = sin(a)", engine=engine, parser=parser, inplace=True) got = df.b - expect = np.sin(df.a) + expect = np.sin(df.a).rename("b") assert expect.dtype == got.dtype assert expect_dtype == got.dtype - tm.assert_series_equal(got, expect, check_names=False) + tm.assert_series_equal(got, expect) def test_undefined_func(self, engine, parser): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) @@ -1898,10 +1907,6 @@ def test_equals_various(other): df = DataFrame({"A": ["a", "b", "c"]}, dtype=object) result = df.eval(f"A == {other}") expected = Series([False, False, False], name="A") - if USE_NUMEXPR: - # https://github.com/pandas-dev/pandas/issues/10239 - # lose name with numexpr engine. Remove when that's fixed. - expected.name = None tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 94e8e469f21e7..643d342b052a4 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -58,26 +58,26 @@ def test_query_default(self, df, expected1, expected2): result = df.query("A>0") tm.assert_frame_equal(result, expected1) result = df.eval("A+1") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_None(self, df, expected1, expected2): result = df.query("A>0", engine=None) tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine=None) - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_python(self, df, expected1, expected2): result = df.query("A>0", engine="python") tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="python") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) def test_query_numexpr(self, df, expected1, expected2): if NUMEXPR_INSTALLED: result = df.query("A>0", engine="numexpr") tm.assert_frame_equal(result, expected1) result = df.eval("A+1", engine="numexpr") - tm.assert_series_equal(result, expected2, check_names=False) + tm.assert_series_equal(result, expected2) else: msg = ( r"'numexpr' is not installed or an unsupported version. " @@ -194,8 +194,12 @@ def test_using_numpy(self, engine, parser): df = Series([0.2, 1.5, 2.8], name="a").to_frame() res = df.eval("@np.floor(a)", engine=engine, parser=parser) expected = np.floor(df["a"]) - if engine == "numexpr": - expected.name = None # See GH 58069 + tm.assert_series_equal(expected, res) + + def test_eval_simple(self, engine, parser): + df = Series([0.2, 1.5, 2.8], name="a").to_frame() + res = df.eval("a", engine=engine, parser=parser) + expected = df["a"] tm.assert_series_equal(expected, res)