Backport PR #31591: ENH: Enable indexing with nullable Boolean (#32192)

pandas-dev · Feb 23, 2020 · 1bc1d59 · 1bc1d59
1 parent eed4bd2
commit 1bc1d59
Show file tree

Hide file tree

Showing 14 changed files with 227 additions and 52 deletions.
diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst
@@ -20,8 +20,9 @@ Nullable Boolean Data Type
 Indexing with NA values
 -----------------------
 
-pandas does not allow indexing with NA values. Attempting to do so
-will raise a ``ValueError``.
+pandas allows indexing with ``NA`` values in a boolean array, which are treated as ``False``.
+
+.. versionchanged:: 1.0.2
 
 .. ipython:: python
    :okexcept:
@@ -30,12 +31,11 @@ will raise a ``ValueError``.
    mask = pd.array([True, False, pd.NA], dtype="boolean")
    s[mask]
 
-The missing values will need to be explicitly filled with True or False prior
-to using the array as a mask.
+If you would prefer to keep the ``NA`` values, you can manually fill them with ``fillna(True)``.
 
 .. ipython:: python
 
-   s[mask.fillna(False)]
+   s[mask.fillna(True)]
 
 .. _boolean.kleene:
 

diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
@@ -59,7 +59,7 @@ of multi-axis indexing.
       slices, **both** the start and the stop are included, when present in the
       index! See :ref:`Slicing with labels <indexing.slicing_with_labels>`
       and :ref:`Endpoints are inclusive <advanced.endpoints_are_inclusive>`.)
-    * A boolean array
+    * A boolean array (any ``NA`` values will be treated as ``False``).
     * A ``callable`` function with one argument (the calling Series or DataFrame) and
       that returns valid output for indexing (one of the above).
 
@@ -75,7 +75,7 @@ of multi-axis indexing.
     * An integer e.g. ``5``.
     * A list or array of integers ``[4, 3, 0]``.
     * A slice object with ints ``1:7``.
-    * A boolean array.
+    * A boolean array (any ``NA`` values will be treated as ``False``).
     * A ``callable`` function with one argument (the calling Series or DataFrame) and
       that returns valid output for indexing (one of the above).
 
@@ -374,6 +374,14 @@ For getting values with a boolean array:
    df1.loc['a'] > 0
    df1.loc[:, df1.loc['a'] > 0]
 
+NA values in a boolean array propagate as ``False``:
+
+.. versionchanged:: 1.0.2
+
+   mask = pd.array([True, False, True, False, pd.NA, False], dtype="boolean")
+   mask
+   df1[mask]
+
 For getting a value explicitly:
 
 .. ipython:: python

diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst
@@ -26,6 +26,33 @@ Fixed regressions
 
 .. ---------------------------------------------------------------------------
 
+Indexing with Nullable Boolean Arrays
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Previously, indexing with a nullable Boolean array containing ``NA`` would raise a ``ValueError``; this is now permitted, with ``NA`` being treated as ``False``. (:issue:`31503`)
+
+.. ipython:: python
+
+    s = pd.Series([1, 2, 3, 4])
+    mask = pd.array([True, True, False, None], dtype="boolean")
+    s
+    mask
+
+*pandas 1.0.0-1.0.1*
+
+.. code-block:: python
+
+    >>> s[mask]
+    Traceback (most recent call last):
+    ...
+    ValueError: cannot mask with array containing NA / NaN values
+
+*pandas 1.0.2*
+
+.. ipython:: python
+
+    s[mask]
+
 .. _whatsnew_102.bug_fixes:
 
 Bug fixes
@@ -45,8 +72,6 @@ Bug fixes
 - Using ``pd.NA`` with :meth:`DataFrame.to_json` now correctly outputs a null value instead of an empty object (:issue:`31615`)
 - Fixed bug in parquet roundtrip with nullable unsigned integer dtypes (:issue:`31896`).
 
-
-
 **Experimental dtypes**
 
 - Fix bug in :meth:`DataFrame.convert_dtypes` for columns that were already using the ``"string"`` dtype (:issue:`31731`).

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -520,7 +520,9 @@ def __getitem__(self, key):
         if com.is_bool_indexer(key):
             # first convert to boolean, because check_array_indexer doesn't
             # allow object dtype
-            key = np.asarray(key, dtype=bool)
+            if is_object_dtype(key):
+                key = np.asarray(key, dtype=bool)
+
             key = check_array_indexer(self, key)
             if key.all():
                 key = slice(0, None, None)

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -124,24 +124,19 @@ def is_bool_indexer(key: Any) -> bool:
     check_array_indexer : Check that `key` is a valid array to index,
         and convert to an ndarray.
     """
-    na_msg = "cannot mask with array containing NA / NaN values"
     if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
         is_array_like(key) and is_extension_array_dtype(key.dtype)
     ):
         if key.dtype == np.object_:
             key = np.asarray(values_from_object(key))
 
             if not lib.is_bool_array(key):
+                na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
                 if isna(key).any():
                     raise ValueError(na_msg)
                 return False
             return True
         elif is_bool_dtype(key.dtype):
-            # an ndarray with bool-dtype by definition has no missing values.
-            # So we only need to check for NAs in ExtensionArrays
-            if is_extension_array_dtype(key.dtype):
-                if np.any(key.isna()):
-                    raise ValueError(na_msg)
             return True
     elif isinstance(key, list):
         try:

diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
@@ -10,6 +10,7 @@
 from pandas.core.dtypes.common import (
     is_array_like,
     is_bool_dtype,
+    is_extension_array_dtype,
     is_integer_dtype,
     is_list_like,
 )
@@ -333,14 +334,11 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
     ...
     IndexError: Boolean index has wrong length: 3 instead of 2.
 
-    A ValueError is raised when the mask cannot be converted to
-    a bool-dtype ndarray.
+    NA values in a boolean array are treated as False.
 
     >>> mask = pd.array([True, pd.NA])
     >>> pd.api.indexers.check_array_indexer(arr, mask)
-    Traceback (most recent call last):
-    ...
-    ValueError: Cannot mask with a boolean indexer containing NA values
+    array([ True, False])
 
     A numpy boolean mask will get passed through (if the length is correct):
 
@@ -392,10 +390,10 @@ def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
 
     dtype = indexer.dtype
     if is_bool_dtype(dtype):
-        try:
+        if is_extension_array_dtype(dtype):
+            indexer = indexer.to_numpy(dtype=bool, na_value=False)
+        else:
             indexer = np.asarray(indexer, dtype=bool)
-        except ValueError:
-            raise ValueError("Cannot mask with a boolean indexer containing NA values")
 
         # GH26658
         if len(indexer) != len(array):

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -13,6 +13,7 @@
     is_iterator,
     is_list_like,
     is_numeric_dtype,
+    is_object_dtype,
     is_scalar,
     is_sequence,
 )
@@ -2319,10 +2320,12 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
                 "the indexed object do not match)."
             )
         result = result.astype(bool)._values
-    else:
-        # key might be sparse / object-dtype bool, check_array_indexer needs bool array
+    elif is_object_dtype(key):
+        # key might be object-dtype bool, check_array_indexer needs bool array
         result = np.asarray(result, dtype=bool)
         result = check_array_indexer(index, result)
+    else:
+        result = check_array_indexer(index, result)
 
     return result
 

diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py
@@ -240,14 +240,17 @@ def test_mask_with_boolean(index):
 
 
 @pytest.mark.parametrize("index", [True, False])
-def test_mask_with_boolean_raises(index):
+def test_mask_with_boolean_na_treated_as_false(index):
+    # https://github.com/pandas-dev/pandas/issues/31503
     s = Series(range(3))
     idx = Categorical([True, False, None])
     if index:
         idx = CategoricalIndex(idx)
 
-    with pytest.raises(ValueError, match="NA / NaN"):
-        s[idx]
+    result = s[idx]
+    expected = s[idx.fillna(False)]
+
+    tm.assert_series_equal(result, expected)
 
 
 @pytest.fixture

diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
@@ -158,21 +158,23 @@ def test_getitem_boolean_array_mask(self, data):
         result = pd.Series(data)[mask]
         self.assert_series_equal(result, expected)
 
-    def test_getitem_boolean_array_mask_raises(self, data):
+    def test_getitem_boolean_na_treated_as_false(self, data):
+        # https://github.com/pandas-dev/pandas/issues/31503
         mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
         mask[:2] = pd.NA
+        mask[2:4] = True
 
-        msg = (
-            "Cannot mask with a boolean indexer containing NA values|"
-            "cannot mask with array containing NA / NaN values"
-        )
-        with pytest.raises(ValueError, match=msg):
-            data[mask]
+        result = data[mask]
+        expected = data[mask.fillna(False)]
+
+        self.assert_extension_array_equal(result, expected)
 
         s = pd.Series(data)
 
-        with pytest.raises(ValueError):
-            s[mask]
+        result = s[mask]
+        expected = s[mask.fillna(False)]
+
+        self.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize(
         "idx",

diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
@@ -93,6 +93,90 @@ def test_setitem_iloc_scalar_multiple_homogoneous(self, data):
         df.iloc[10, 1] = data[1]
         assert df.loc[10, "B"] == data[1]
 
+    @pytest.mark.parametrize(
+        "mask",
+        [
+            np.array([True, True, True, False, False]),
+            pd.array([True, True, True, False, False], dtype="boolean"),
+            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
+        ],
+        ids=["numpy-array", "boolean-array", "boolean-array-na"],
+    )
+    def test_setitem_mask(self, data, mask, box_in_series):
+        arr = data[:5].copy()
+        expected = arr.take([0, 0, 0, 3, 4])
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+        arr[mask] = data[0]
+        self.assert_equal(expected, arr)
+
+    def test_setitem_mask_raises(self, data, box_in_series):
+        # wrong length
+        mask = np.array([True, False])
+
+        if box_in_series:
+            data = pd.Series(data)
+
+        with pytest.raises(IndexError, match="wrong length"):
+            data[mask] = data[0]
+
+        mask = pd.array(mask, dtype="boolean")
+        with pytest.raises(IndexError, match="wrong length"):
+            data[mask] = data[0]
+
+    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
+        mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean")
+        mask[:3] = True
+        mask[3:5] = pd.NA
+
+        if box_in_series:
+            data = pd.Series(data)
+
+        data[mask] = data[0]
+
+        assert (data[:3] == data[0]).all()
+
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series):
+        arr = data[:5].copy()
+        expected = data.take([0, 0, 0, 3, 4])
+
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+
+        arr[idx] = arr[0]
+        self.assert_equal(arr, expected)
+
+    @pytest.mark.parametrize(
+        "idx, box_in_series",
+        [
+            ([0, 1, 2, pd.NA], False),
+            pytest.param(
+                [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948")
+            ),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+        ],
+        ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
+    )
+    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
+        arr = data.copy()
+
+        # TODO(xfail) this raises KeyError about labels not found (it tries label-based)
+        # for list of labels with Series
+        if box_in_series:
+            arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
+
+        msg = "Cannot index with an integer indexer containing NA values"
+        with pytest.raises(ValueError, match=msg):
+            arr[idx] = arr[0]
+
     @pytest.mark.parametrize("as_callable", [True, False])
     @pytest.mark.parametrize("setter", ["loc", None])
     def test_setitem_mask_aligned(self, data, as_callable, setter):

diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
@@ -396,6 +396,48 @@ def test_setitem_scalar_key_sequence_raise(self, data):
         # Failed: DID NOT RAISE <class 'ValueError'>
         super().test_setitem_scalar_key_sequence_raise(data)
 
+    # TODO: there is some issue with PandasArray, therefore,
+    #   skip the setitem test for now, and fix it later (GH 31446)
+
+    @skip_nested
+    @pytest.mark.parametrize(
+        "mask",
+        [
+            np.array([True, True, True, False, False]),
+            pd.array([True, True, True, False, False], dtype="boolean"),
+        ],
+        ids=["numpy-array", "boolean-array"],
+    )
+    def test_setitem_mask(self, data, mask, box_in_series):
+        super().test_setitem_mask(data, mask, box_in_series)
+
+    @skip_nested
+    def test_setitem_mask_raises(self, data, box_in_series):
+        super().test_setitem_mask_raises(data, box_in_series)
+
+    @skip_nested
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series):
+        super().test_setitem_integer_array(data, idx, box_in_series)
+
+    @skip_nested
+    @pytest.mark.parametrize(
+        "idx, box_in_series",
+        [
+            ([0, 1, 2, pd.NA], False),
+            pytest.param([0, 1, 2, pd.NA], True, marks=pytest.mark.xfail),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+            (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False),
+        ],
+        ids=["list-False", "list-True", "integer-array-False", "integer-array-True"],
+    )
+    def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
+        super().test_setitem_integer_with_missing_raises(data, idx, box_in_series)
+
     @skip_nested
     def test_setitem_slice(self, data, box_in_series):
         super().test_setitem_slice(data, box_in_series)