Skip to content

Commit

Permalink
ENH: Enable indexing with nullable Boolean (#31591)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsaxton committed Feb 22, 2020
1 parent 80d37ad commit b9bcdc3
Show file tree
Hide file tree
Showing 14 changed files with 109 additions and 66 deletions.
10 changes: 5 additions & 5 deletions doc/source/user_guide/boolean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ Nullable Boolean data type
Indexing with NA values
-----------------------

pandas does not allow indexing with NA values. Attempting to do so
will raise a ``ValueError``.
pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``.

.. versionchanged:: 1.0.2

.. ipython:: python
:okexcept:
Expand All @@ -30,12 +31,11 @@ will raise a ``ValueError``.
mask = pd.array([True, False, pd.NA], dtype="boolean")
s[mask]
The missing values will need to be explicitly filled with True or False prior
to using the array as a mask.
If you would prefer to keep the ``NA`` values, you can manually fill them with ``fillna(True)``.

.. ipython:: python
s[mask.fillna(False)]
s[mask.fillna(True)]
.. _boolean.kleene:

Expand Down
12 changes: 10 additions & 2 deletions doc/source/user_guide/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ of multi-axis indexing.
slices, **both** the start and the stop are included, when present in the
index! See :ref:`Slicing with labels <indexing.slicing_with_labels>`
and :ref:`Endpoints are inclusive <advanced.endpoints_are_inclusive>`.)
* A boolean array
* A boolean array (any ``NA`` values will be treated as ``False``).
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).

Expand All @@ -75,7 +75,7 @@ of multi-axis indexing.
* An integer e.g. ``5``.
* A list or array of integers ``[4, 3, 0]``.
* A slice object with ints ``1:7``.
* A boolean array.
* A boolean array (any ``NA`` values will be treated as ``False``).
* A ``callable`` function with one argument (the calling Series or DataFrame) and
that returns valid output for indexing (one of the above).

Expand Down Expand Up @@ -374,6 +374,14 @@ For getting values with a boolean array:
df1.loc['a'] > 0
df1.loc[:, df1.loc['a'] > 0]
NA values in a boolean array propagate as ``False``:

.. versionchanged:: 1.0.2

.. ipython:: python

   mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
   mask
   df1[mask]

For getting a value explicitly:

.. ipython:: python
Expand Down
29 changes: 27 additions & 2 deletions doc/source/whatsnew/v1.0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,33 @@ Fixed regressions

.. ---------------------------------------------------------------------------
Indexing with Nullable Boolean Arrays
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Previously, indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``; this is now permitted, with ``NA`` being treated as ``False``. (:issue:`31503`)

.. ipython:: python
s = pd.Series([1, 2, 3, 4])
mask = pd.array([True, True, False, None], dtype="boolean")
s
mask
*pandas 1.0.0-1.0.1*

.. code-block:: python
>>> s[mask]
Traceback (most recent call last):
...
ValueError: cannot mask with array containing NA / NaN values
*pandas 1.0.2*

.. ipython:: python
s[mask]
.. _whatsnew_102.bug_fixes:

Bug fixes
Expand All @@ -45,8 +72,6 @@ Bug fixes
- Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
- Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).



**Experimental dtypes**

- Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,9 @@ def __getitem__(self, key):
if com.is_bool_indexer(key):
# first convert to boolean, because check_array_indexer doesn't
# allow object dtype
key = np.asarray(key, dtype=bool)
if is_object_dtype(key):
key = np.asarray(key, dtype=bool)

key = check_array_indexer(self, key)
if key.all():
key = slice(0, None, None)
Expand Down
7 changes: 1 addition & 6 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,24 +118,19 @@ def is_bool_indexer(key: Any) -> bool:
check_array_indexer : Check that `key` is a valid array to index,
and convert to an ndarray.
"""
na_msg = "cannot mask with array containing NA / NaN values"
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
is_array_like(key) and is_extension_array_dtype(key.dtype)
):
if key.dtype == np.object_:
key = np.asarray(values_from_object(key))

if not lib.is_bool_array(key):
na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
if isna(key).any():
raise ValueError(na_msg)
return False
return True
elif is_bool_dtype(key.dtype):
# an ndarray with bool-dtype by definition has no missing values.
# So we only need to check for NAs in ExtensionArrays
if is_extension_array_dtype(key.dtype):
if np.any(key.isna()):
raise ValueError(na_msg)
return True
elif isinstance(key, list):
try:
Expand Down
14 changes: 6 additions & 8 deletions pandas/core/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_extension_array_dtype,
is_integer_dtype,
is_list_like,
)
Expand Down Expand Up @@ -366,14 +367,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
...
IndexError: Boolean index has wrong length: 3 instead of 2.
A ValueError is raised when the mask cannot be converted to
a bool-dtype ndarray.
NA values in a boolean array are treated as False.
>>> mask = pd.array([True, pd.NA])
>>> pd.api.indexers.check_array_indexer(arr, mask)
Traceback (most recent call last):
...
ValueError: Cannot mask with a boolean indexer containing NA values
array([ True, False])
A numpy boolean mask will get passed through (if the length is correct):
Expand Down Expand Up @@ -425,10 +423,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:

dtype = indexer.dtype
if is_bool_dtype(dtype):
try:
if is_extension_array_dtype(dtype):
indexer = indexer.to_numpy(dtype=bool, na_value=False)
else:
indexer = np.asarray(indexer, dtype=bool)
except ValueError:
raise ValueError("Cannot mask with a boolean indexer containing NA values")

# GH26658
if len(indexer) != len(array):
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
is_iterator,
is_list_like,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_sequence,
)
Expand Down Expand Up @@ -2189,10 +2190,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
"the indexed object do not match)."
)
result = result.astype(bool)._values
else:
# key might be sparse / object-dtype bool, check_array_indexer needs bool array
elif is_object_dtype(key):
# key might be object-dtype bool, check_array_indexer needs bool array
result = np.asarray(result, dtype=bool)
result = check_array_indexer(index, result)
else:
result = check_array_indexer(index, result)

return result

Expand Down
9 changes: 6 additions & 3 deletions pandas/tests/arrays/categorical/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,17 @@ def test_mask_with_boolean(index):


@pytest.mark.parametrize("index", [True, False])
def test_mask_with_boolean_raises(index):
def test_mask_with_boolean_na_treated_as_false(index):
# https://github.com/pandas-dev/pandas/issues/31503
s = Series(range(3))
idx = Categorical([True, False, None])
if index:
idx = CategoricalIndex(idx)

with pytest.raises(ValueError, match="NA / NaN"):
s[idx]
result = s[idx]
expected = s[idx.fillna(False)]

tm.assert_series_equal(result, expected)


@pytest.fixture
Expand Down
20 changes: 11 additions & 9 deletions pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,21 +158,23 @@ def test_getitem_boolean_array_mask(self, data):
result = pd.Series(data)[mask]
self.assert_series_equal(result, expected)

def test_getitem_boolean_array_mask_raises(self, data):
def test_getitem_boolean_na_treated_as_false(self, data):
# https://github.com/pandas-dev/pandas/issues/31503
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
mask[:2] = pd.NA
mask[2:4] = True

msg = (
"Cannot mask with a boolean indexer containing NA values|"
"cannot mask with array containing NA / NaN values"
)
with pytest.raises(ValueError, match=msg):
data[mask]
result = data[mask]
expected = data[mask.fillna(False)]

self.assert_extension_array_equal(result, expected)

s = pd.Series(data)

with pytest.raises(ValueError):
s[mask]
result = s[mask]
expected = s[mask.fillna(False)]

self.assert_series_equal(result, expected)

@pytest.mark.parametrize(
"idx",
Expand Down
18 changes: 8 additions & 10 deletions pandas/tests/extension/base/setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
[
np.array([True, True, True, False, False]),
pd.array([True, True, True, False, False], dtype="boolean"),
pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
],
ids=["numpy-array", "boolean-array"],
ids=["numpy-array", "boolean-array", "boolean-array-na"],
)
def test_setitem_mask(self, data, mask, box_in_series):
arr = data[:5].copy()
Expand All @@ -124,20 +125,17 @@ def test_setitem_mask_raises(self, data, box_in_series):
with pytest.raises(IndexError, match="wrong length"):
data[mask] = data[0]

def test_setitem_mask_boolean_array_raises(self, data, box_in_series):
# missing values in mask
def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
mask[:2] = pd.NA
mask[:3] = True
mask[3:5] = pd.NA

if box_in_series:
data = pd.Series(data)

msg = (
"Cannot mask with a boolean indexer containing NA values|"
"cannot mask with array containing NA / NaN values"
)
with pytest.raises(ValueError, match=msg):
data[mask] = data[0]
data[mask] = data[0]

assert (data[:3] == data[0]).all()

@pytest.mark.parametrize(
"idx",
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,6 @@ def test_setitem_mask(self, data, mask, box_in_series):
def test_setitem_mask_raises(self, data, box_in_series):
super().test_setitem_mask_raises(data, box_in_series)

@skip_nested
def test_setitem_mask_boolean_array_raises(self, data, box_in_series):
super().test_setitem_mask_boolean_array_raises(data, box_in_series)

@skip_nested
@pytest.mark.parametrize(
"idx",
Expand Down
12 changes: 7 additions & 5 deletions pandas/tests/indexing/test_check_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,14 @@ def test_valid_input(indexer, expected):
@pytest.mark.parametrize(
"indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")],
)
def test_bool_raise_missing_values(indexer):
array = np.array([1, 2, 3])
def test_boolean_na_returns_indexer(indexer):
# https://github.com/pandas-dev/pandas/issues/31503
arr = np.array([1, 2, 3])

msg = "Cannot mask with a boolean indexer containing NA values"
with pytest.raises(ValueError, match=msg):
check_array_indexer(array, indexer)
result = check_array_indexer(arr, indexer)
expected = np.array([True, False, False], dtype=bool)

tm.assert_numpy_array_equal(result, expected)


@pytest.mark.parametrize(
Expand Down
27 changes: 19 additions & 8 deletions pandas/tests/indexing/test_na_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,18 +62,29 @@ def test_series_mask_boolean(values, dtype, mask, box_mask, frame):


@pytest.mark.parametrize("frame", [True, False])
def test_indexing_with_na_raises(frame):
def test_na_treated_as_false(frame):
# https://github.com/pandas-dev/pandas/issues/31503
s = pd.Series([1, 2, 3], name="name")

if frame:
s = s.to_frame()

mask = pd.array([True, False, None], dtype="boolean")
match = "cannot mask with array containing NA / NaN values"
with pytest.raises(ValueError, match=match):
s[mask]

with pytest.raises(ValueError, match=match):
s.loc[mask]
result = s[mask]
expected = s[mask.fillna(False)]

result_loc = s.loc[mask]
expected_loc = s.loc[mask.fillna(False)]

with pytest.raises(ValueError, match=match):
s.iloc[mask]
result_iloc = s.iloc[mask]
expected_iloc = s.iloc[mask.fillna(False)]

if frame:
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result_loc, expected_loc)
tm.assert_frame_equal(result_iloc, expected_iloc)
else:
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result_loc, expected_loc)
tm.assert_series_equal(result_iloc, expected_iloc)
2 changes: 1 addition & 1 deletion pandas/tests/series/indexing/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def test_getitem_boolean_object(string_series):

# nans raise exception
omask[5:10] = np.nan
msg = "cannot mask with array containing NA / NaN values"
msg = "Cannot mask with non-boolean array containing NA / NaN values"
with pytest.raises(ValueError, match=msg):
s[omask]
with pytest.raises(ValueError, match=msg):
Expand Down

0 comments on commit b9bcdc3

Please sign in to comment.