From aacdf613a897320ad2a12a0397f0f8ebab349093 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 7 Dec 2023 22:19:15 +0100 Subject: [PATCH] Backport PR #56123 on branch 2.1.x (BUG: ne comparison returns False for NA and other value) (#56382) --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/arrays/string_arrow.py | 6 ++++- pandas/tests/arithmetic/test_object.py | 29 ++++++++++++++-------- pandas/tests/arrays/string_/test_string.py | 26 ++++++++++++++----- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 6cd44d954a708..26fd23d80208c 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -30,6 +30,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) - diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3b474774b7873..803e032b452cf 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import operator import re from typing import ( TYPE_CHECKING, @@ -600,7 +601,10 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): def _cmp_method(self, other, op): result = super()._cmp_method(other, op) - return result.to_numpy(np.bool_, na_value=False) + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) def value_counts(self, dropna: bool = True): from pandas import Series diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index 5ffbf1a38e845..76e38500df5a2 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -8,10 +8,13 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core import ops @@ -31,20 +34,24 @@ def test_comparison_object_numeric_nas(self, comparison_op): expected = func(ser.astype(float), shifted.astype(float)) tm.assert_series_equal(result, expected) - def test_object_comparisons(self): - ser = Series(["a", "b", np.nan, "c", "a"]) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_object_comparisons(self, infer_string): + with option_context("future.infer_string", infer_string): + ser = Series(["a", "b", np.nan, "c", "a"]) - result = ser == "a" - expected = Series([True, False, False, False, True]) - tm.assert_series_equal(result, expected) + result = ser == "a" + expected = Series([True, False, False, False, True]) + tm.assert_series_equal(result, expected) - result = ser < "a" - expected = Series([False, False, False, False, False]) - tm.assert_series_equal(result, expected) + result = ser < "a" + expected = Series([False, False, False, False, False]) + tm.assert_series_equal(result, expected) - result = ser != "a" - expected = -(ser == "a") - tm.assert_series_equal(result, expected) + result = ser != "a" + expected = -(ser == "a") + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) def test_more_na_comparisons(self, dtype): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 68b2b01ddcaf3..b9c59018fa797 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,6 +2,8 @@ This module tests the functionality of StringArray and ArrowStringArray. Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ +import operator + import numpy as np import pytest @@ -221,7 +223,10 @@ def test_comparison_methods_scalar(comparison_op, dtype): result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": expected = np.array([getattr(item, op_name)(other) for item in a]) - expected[1] = False + if comparison_op == operator.ne: + expected[1] = True + else: + expected[1] = False tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -236,7 +241,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype): result = getattr(a, op_name)(pd.NA) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False]) + if operator.ne == comparison_op: + expected = np.array([True, True, True]) + else: + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" @@ -262,7 +270,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype): if dtype.storage == "pyarrow_numpy": expected_data = { "__eq__": [False, False, False], - "__ne__": [True, False, True], + "__ne__": [True, True, True], }[op_name] expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) @@ -282,12 +290,18 @@ def test_comparison_methods_array(comparison_op, dtype): other = [None, None, "c"] result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([False, False, False]) - expected[-1] = getattr(other[-1], op_name)(a[-1]) + if operator.ne == comparison_op: + expected = np.array([True, True, False]) + else: + expected = np.array([False, False, False]) + expected[-1] = getattr(other[-1], op_name)(a[-1]) tm.assert_numpy_array_equal(result, expected) result = getattr(a, op_name)(pd.NA) - expected = np.array([False, False, False]) + if operator.ne == comparison_op: + expected = np.array([True, True, True]) + else: + expected = np.array([False, False, False]) tm.assert_numpy_array_equal(result, expected) else: