Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: ne comparison returns False for NA and other value #56123

Merged
merged 5 commits into from Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.4.rst
Expand Up @@ -29,6 +29,7 @@ Bug fixes
- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
- Fixed bug in :meth:`Series.mode` not keeping object dtype when ``infer_string`` is set (:issue:`56183`)
- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/arrays/string_arrow.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from functools import partial
import operator
import re
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -663,7 +664,10 @@ def _convert_int_dtype(self, result):

def _cmp_method(self, other, op):
result = super()._cmp_method(other, op)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this be fixed in the ArrowExtensionArray._cmp_method?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, the arrow dtypes will return an arrow array with bool dtype that has NA values in those places, which is correct, this fix is to emulate object dtype

return result.to_numpy(np.bool_, na_value=False)
if op == operator.ne:
return result.to_numpy(np.bool_, na_value=True)
else:
return result.to_numpy(np.bool_, na_value=False)

def value_counts(self, dropna: bool = True) -> Series:
from pandas import Series
Expand Down
29 changes: 18 additions & 11 deletions pandas/tests/arithmetic/test_object.py
Expand Up @@ -10,10 +10,13 @@

from pandas._config import using_pyarrow_string_dtype

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Series,
Timestamp,
option_context,
)
import pandas._testing as tm
from pandas.core import ops
Expand All @@ -33,20 +36,24 @@ def test_comparison_object_numeric_nas(self, comparison_op):
expected = func(ser.astype(float), shifted.astype(float))
tm.assert_series_equal(result, expected)

def test_object_comparisons(self):
ser = Series(["a", "b", np.nan, "c", "a"])
@pytest.mark.parametrize(
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
def test_object_comparisons(self, infer_string):
with option_context("future.infer_string", infer_string):
ser = Series(["a", "b", np.nan, "c", "a"])

result = ser == "a"
expected = Series([True, False, False, False, True])
tm.assert_series_equal(result, expected)
result = ser == "a"
expected = Series([True, False, False, False, True])
tm.assert_series_equal(result, expected)

result = ser < "a"
expected = Series([False, False, False, False, False])
tm.assert_series_equal(result, expected)
result = ser < "a"
expected = Series([False, False, False, False, False])
tm.assert_series_equal(result, expected)

result = ser != "a"
expected = -(ser == "a")
tm.assert_series_equal(result, expected)
result = ser != "a"
expected = -(ser == "a")
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("dtype", [None, object])
def test_more_na_comparisons(self, dtype):
Expand Down
26 changes: 20 additions & 6 deletions pandas/tests/arrays/string_/test_string.py
Expand Up @@ -2,6 +2,8 @@
This module tests the functionality of StringArray and ArrowStringArray.
Tests for the str accessors are in pandas/tests/strings/test_string_array.py
"""
import operator

import numpy as np
import pytest

Expand Down Expand Up @@ -229,7 +231,10 @@ def test_comparison_methods_scalar(comparison_op, dtype):
result = getattr(a, op_name)(other)
if dtype.storage == "pyarrow_numpy":
expected = np.array([getattr(item, op_name)(other) for item in a])
expected[1] = False
if comparison_op == operator.ne:
expected[1] = True
else:
expected[1] = False
tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
else:
expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
Expand All @@ -244,7 +249,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
result = getattr(a, op_name)(pd.NA)

if dtype.storage == "pyarrow_numpy":
expected = np.array([False, False, False])
if operator.ne == comparison_op:
expected = np.array([True, True, True])
else:
expected = np.array([False, False, False])
tm.assert_numpy_array_equal(result, expected)
else:
expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
Expand All @@ -270,7 +278,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
if dtype.storage == "pyarrow_numpy":
expected_data = {
"__eq__": [False, False, False],
"__ne__": [True, False, True],
"__ne__": [True, True, True],
}[op_name]
expected = np.array(expected_data)
tm.assert_numpy_array_equal(result, expected)
Expand All @@ -290,12 +298,18 @@ def test_comparison_methods_array(comparison_op, dtype):
other = [None, None, "c"]
result = getattr(a, op_name)(other)
if dtype.storage == "pyarrow_numpy":
expected = np.array([False, False, False])
expected[-1] = getattr(other[-1], op_name)(a[-1])
if operator.ne == comparison_op:
expected = np.array([True, True, False])
else:
expected = np.array([False, False, False])
expected[-1] = getattr(other[-1], op_name)(a[-1])
tm.assert_numpy_array_equal(result, expected)

result = getattr(a, op_name)(pd.NA)
expected = np.array([False, False, False])
if operator.ne == comparison_op:
expected = np.array([True, True, True])
else:
expected = np.array([False, False, False])
tm.assert_numpy_array_equal(result, expected)

else:
Expand Down