diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 040ca048d1224..88740608e015a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -735,6 +735,7 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` losing dtype when setting a :class:`DataFrame` into duplicated columns (:issue:`53143`) - Bug in :meth:`DataFrame.__setitem__` with a boolean mask and :meth:`DataFrame.putmask` with mixed non-numeric dtypes and a value other than ``NaN`` incorrectly raising ``TypeError`` (:issue:`53291`) - Bug in :meth:`DataFrame.iloc` when using ``nan`` as the only element (:issue:`52234`) +- Bug in :meth:`DataFrame.where`, :meth:`DataFrame.mask`, :meth:`Series.where`, and :meth:`Series.mask` silently filling ``pd.NA`` entries in ``cond``; a ``ValueError`` is now raised when ``cond`` contains ``pd.NA`` (:issue:`52955`) - Bug in :meth:`Series.loc` casting :class:`Series` to ``np.dnarray`` when assigning :class:`Series` at predefined index of ``object`` dtype :class:`Series` (:issue:`48933`) Missing diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b9407ebe6624a..fc0f6a8dabb7f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10292,42 +10292,49 @@ def _where( axis = self._get_axis_number(axis) # align the cond to same shape as myself + cond_hasna: bool_t cond = common.apply_if_callable(cond, self) if isinstance(cond, NDFrame): # CoW: Make sure reference is not kept alive - if cond.ndim == 1 and self.ndim == 2: - cond = cond._constructor_expanddim( - {i: cond for i in range(len(self.columns))}, - copy=False, + cond_hasna = cond.isna().any(axis=None) + if not cond_hasna: + if cond.ndim == 1 and self.ndim == 2: + cond = cond._constructor_expanddim( + {i: cond for i in range(len(self.columns))}, + copy=False, + ) + cond.columns = self.columns + cond = cond.align(self, join="right", copy=False)[0].fillna( + bool(inplace) ) - cond.columns = self.columns - cond = cond.align(self, join="right", copy=False)[0] else: if
not hasattr(cond, "shape"): cond = np.asanyarray(cond) if cond.shape != self.shape: raise ValueError("Array conditional must be same shape as self") cond = self._constructor(cond, **self._construct_axes_dict(), copy=False) - - # make sure we are boolean - fill_value = bool(inplace) - cond = cond.fillna(fill_value) + cond_hasna = cond.isna().any(axis=None) msg = "Boolean array expected for the condition, not {dtype}" + na_msg = "Cannot mask with an array containing NA / NaN values" if not cond.empty: if not isinstance(cond, ABCDataFrame): # This is a single-dimensional object. if not is_bool_dtype(cond): raise ValueError(msg.format(dtype=cond.dtype)) + if cond_hasna: + raise ValueError(na_msg) else: for _dt in cond.dtypes: if not is_bool_dtype(_dt): raise ValueError(msg.format(dtype=_dt)) + if cond_hasna: + raise ValueError(na_msg) if cond._mgr.any_extension_types: # GH51574: avoid object ndarray conversion later on cond = cond._constructor( - cond.to_numpy(dtype=bool, na_value=fill_value), + cond.to_numpy(dtype=bool), **cond._construct_axes_dict(), ) else: diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index 264e27c9c122e..5387c3ed2bd3d 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -3,11 +3,13 @@ """ import numpy as np +import pytest from pandas import ( NA, DataFrame, Float64Dtype, + Int64Dtype, Series, StringDtype, Timedelta, @@ -150,3 +152,13 @@ def test_mask_inplace_no_other(): df.mask(cond, inplace=True) expected = DataFrame({"a": [np.nan, 2], "b": ["x", np.nan]}) tm.assert_frame_equal(df, expected) + + +def test_mask_with_na(): + # GH#52955 + df = DataFrame([[1, NA], [NA, 2]], dtype=Int64Dtype()) + msg = "Cannot mask with an array containing NA / NaN values" + + for cond_frame in [df, df[0]]: + with pytest.raises(ValueError, match=msg): + df.mask(cond_frame % 2 == 1, 0) diff --git a/pandas/tests/frame/indexing/test_setitem.py 
b/pandas/tests/frame/indexing/test_setitem.py index ccc1249088f9a..afb5cdcae9fa0 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1144,19 +1144,6 @@ def test_loc_setitem_all_false_boolean_two_blocks(self): df.loc[indexer, ["b"]] = DataFrame({"b": [5, 6]}, index=[0, 1]) tm.assert_frame_equal(df, expected) - def test_setitem_ea_boolean_mask(self): - # GH#47125 - df = DataFrame([[-1, 2], [3, -4]]) - expected = DataFrame([[0, 2], [3, 0]]) - boolean_indexer = DataFrame( - { - 0: Series([True, False], dtype="boolean"), - 1: Series([pd.NA, True], dtype="boolean"), - } - ) - df[boolean_indexer] = 0 - tm.assert_frame_equal(df, expected) - class TestDataFrameSetitemCopyViewSemantics: def test_setitem_always_copy(self, float_frame): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 3d3df2d714ca4..07f9d9cd3d492 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -11,6 +11,7 @@ DataFrame, DatetimeIndex, Index, + Int64Dtype, Series, StringDtype, Timestamp, @@ -1046,16 +1047,6 @@ def test_where_dt64_2d(): _check_where_equivalences(df, mask, other, expected) -def test_where_producing_ea_cond_for_np_dtype(): - # GH#44014 - df = DataFrame({"a": Series([1, pd.NA, 2], dtype="Int64"), "b": [1, 2, 3]}) - result = df.where(lambda x: x.apply(lambda y: y > 1, axis=1)) - expected = DataFrame( - {"a": Series([pd.NA, pd.NA, 2], dtype="Int64"), "b": [np.nan, 2, 3]} - ) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) @@ -1075,3 +1066,13 @@ def test_where_inplace_no_other(): df.where(cond, inplace=True) expected = DataFrame({"a": [1, np.nan], "b": [np.nan, "y"]}) tm.assert_frame_equal(df, expected) + + +def test_where_with_na(): + # GH#52955 + df = DataFrame([[1, pd.NA], [pd.NA, 2]], dtype=Int64Dtype()) + msg = "Cannot mask with an array 
containing NA / NaN values" + + for cond_frame in [df, df[0]]: + with pytest.raises(ValueError, match=msg): + df.where(cond_frame % 2 == 1, 0) diff --git a/pandas/tests/series/indexing/test_mask.py b/pandas/tests/series/indexing/test_mask.py index 3c21cd0d5ca64..a3648592e734b 100644 --- a/pandas/tests/series/indexing/test_mask.py +++ b/pandas/tests/series/indexing/test_mask.py @@ -1,7 +1,11 @@ import numpy as np import pytest -from pandas import Series +from pandas import ( + NA, + Int64Dtype, + Series, +) import pandas._testing as tm @@ -67,3 +71,13 @@ def test_mask_inplace(): rs = s.copy() rs.mask(cond, -s, inplace=True) tm.assert_series_equal(rs, s.mask(cond, -s)) + + +def test_mask_with_na(): + # GH#52955 + ser = Series([1, 2, NA], dtype=Int64Dtype()) + msg = "Cannot mask with an array containing NA / NaN values" + + for cond_arr in [ser, ser.array]: + with pytest.raises(ValueError, match=msg): + ser.mask(cond_arr % 2 == 1, 0) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 4e002420dadfc..241f9e7230080 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -5,6 +5,8 @@ import pandas as pd from pandas import ( + NA, + Int64Dtype, Series, Timestamp, date_range, @@ -471,3 +473,13 @@ def test_where_datetimelike_categorical(tz_naive_fixture): res = pd.DataFrame(lvals).where(mask[:, None], pd.DataFrame(rvals)) tm.assert_frame_equal(res, pd.DataFrame(dr)) + + +def test_where_with_na(): + # GH#52955 + ser = Series([1, 2, NA], dtype=Int64Dtype()) + msg = "Cannot mask with an array containing NA / NaN values" + + for cond_arr in [ser, ser.array]: + with pytest.raises(ValueError, match=msg): + ser.where(cond_arr % 2 == 1, 0)