Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[WIP]: Indexing with BooleanArray propagates NA #30265

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops, ops
Expand Down Expand Up @@ -294,6 +295,14 @@ def __getitem__(self, item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]
elif is_array_like(item) and is_bool_dtype(item.dtype):
if isinstance(item, BooleanArray):
# items, mask = item._data, item._mask
take = item._data | item._mask
result = self._data[take]
# output masked anywhere where self is masked, or the input was masked
omask = (self._mask | item._mask)[take]
return type(self)(result, omask)
return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
Expand Down
11 changes: 8 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from pandas.core.accessor import PandasDelegate, delegate_names
import pandas.core.algorithms as algorithms
from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
from pandas.core.arrays import BooleanArray
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
Expand Down Expand Up @@ -1996,10 +1997,14 @@ def __getitem__(self, key):
return np.nan
else:
return self.categories[i]
elif com.is_bool_indexer(key) and isinstance(key, BooleanArray):
take = key._data | key._mask
values = self._codes[take]
values[key._mask[take]] = -1
else:
return self._constructor(
values=self._codes[key], dtype=self.dtype, fastpath=True
)
values = self._codes[key]

return self._constructor(values=values, dtype=self.dtype, fastpath=True)

def __setitem__(self, key, value):
"""
Expand Down
14 changes: 13 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from pandas._typing import DatetimeLikeScalar
from pandas.core import missing, nanops
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
from pandas.core.arrays import BooleanArray
import pandas.core.common as com
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.ops.invalid import make_invalid_op
Expand Down Expand Up @@ -415,8 +416,15 @@ def __getitem__(self, key):
val = getitem(key)
return self._box_func(val)

mask = None
if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
if isinstance(key, BooleanArray):
# TODO: Handle all boolean indexers.
mask = key._mask
key = key._data | mask
mask = mask[key]
else:
key = np.asarray(key, dtype=bool)
if key.all():
key = slice(0, None, None)
else:
Expand All @@ -438,6 +446,10 @@ def __getitem__(self, key):
freq = self.freq

result = getitem(key)
if mask is not None:
# TODO: Check that we've copied!
result[mask] = iNaT

if result.ndim > 1:
# To support MPL which performs slicing with 2 dim
# even though it only has 1 dim by definition
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
is_scalar,
)
from pandas.core.dtypes.dtypes import register_extension_dtype
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops, ops
Expand Down Expand Up @@ -366,10 +367,20 @@ def fmt(x):
return fmt

def __getitem__(self, item):
# Index into a masked array: values live in `self._data`, and a True entry in
# `self._mask` marks the corresponding position as missing.
#
# Imported inside the method rather than at module level -- presumably to
# avoid a circular import with pandas.core.arrays; TODO confirm.
from pandas.core.arrays import BooleanArray

# Scalar integer position: return the dtype's NA value when the position is
# masked, otherwise the raw stored value.
if is_integer(item):
if self._mask[item]:
return self.dtype.na_value
return self._data[item]
# Array-like indexer with a boolean dtype.
elif is_array_like(item) and is_bool_dtype(item.dtype):
# A BooleanArray indexer carries its own mask (`item._mask`), so it can
# contain missing entries in addition to True/False.
if isinstance(item, BooleanArray):
# Select positions where the indexer is True *or* missing, so that a
# missing indexer entry yields an output element instead of being dropped.
take = item._data | item._mask
result = self._data[take]
# The output is masked wherever self was masked or the indexer entry was
# missing; the combined mask is filtered with the same `take` selection so
# it lines up with `result`.
omask = (self._mask | item._mask)[take]
return type(self)(result, omask)
# Every indexer not returned above: apply the same key to the data and the
# mask, and wrap both in a new array of the same type.
return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self):
Expand Down
16 changes: 15 additions & 1 deletion pandas/core/arrays/string_.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

from pandas import compat
from pandas.core import ops
from pandas.core.arrays import PandasArray
from pandas.core.arrays import BooleanArray, PandasArray
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.missing import isna

Expand Down Expand Up @@ -232,6 +233,19 @@ def __setitem__(self, key, value):

super().__setitem__(key, value)

def __getitem__(self, item):
# Doing this here, as PandasArray.__getitem__ can't guarantee dtype stability
# when getting with a boolean mask.
if com.is_bool_indexer(item):
# Only BooleanArray indexers get NA propagation; any other boolean indexer
# is handled by the parent implementation at the end of the method.
if isinstance(item, BooleanArray):
# Select positions where the indexer is True *or* missing, so that a
# missing indexer entry yields an output element instead of being dropped.
take = item._data | item._mask
# `take` is built from the indexer's `_data` and `_mask` rather than being a
# BooleanArray itself, so this nested lookup is expected to fall through to
# the parent __getitem__ -- assumes `_data`/`_mask` are plain ndarrays; confirm.
result = self[take]
# NOTE(review): the assignment below mutates `result` in place, so it relies
# on `self[take]` returning a copy rather than a view of self. NumPy
# boolean-mask indexing copies, but this goes through the parent
# __getitem__ -- confirm before merging.
# Positions selected only because the indexer entry was missing are set to NA.
result[item._mask[take]] = self.dtype.na_value
return result
return super().__getitem__(item)

def fillna(self, value=None, method=None, limit=None):
# TODO: validate dtype
return super().fillna(value, method, limit)
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,9 @@ def is_bool_indexer(key: Any) -> bool:
elif is_bool_dtype(key.dtype):
# an ndarray with bool-dtype by definition has no missing values.
# So we only need to check for NAs in ExtensionArrays
if is_extension_array_dtype(key.dtype):
if np.any(key.isna()):
raise ValueError(na_msg)
# if is_extension_array_dtype(key.dtype):
# if np.any(key.isna()):
# raise ValueError(na_msg)
return True
elif isinstance(key, list):
try:
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
from pandas.core import ops
from pandas.core.accessor import CachedAccessor
import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays import BooleanArray, ExtensionArray
from pandas.core.base import IndexOpsMixin, PandasObject
import pandas.core.common as com
from pandas.core.construction import extract_array
Expand Down Expand Up @@ -4014,7 +4014,11 @@ def __getitem__(self, key):
return promote(getitem(key))

if com.is_bool_indexer(key):
key = np.asarray(key, dtype=bool)
if isinstance(key, BooleanArray):
# TODO: Handle all boolean indexers.
key = key._data | key._mask
else:
key = np.asarray(key, dtype=bool)

key = com.values_from_object(key)
result = getitem(key)
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import (
is_extension_array_dtype,
is_float,
is_integer,
is_iterator,
Expand All @@ -19,6 +20,7 @@
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import _infer_fill_value, isna

import pandas.core.common as com
Expand Down Expand Up @@ -2309,6 +2311,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
If the index of the key is unalignable to index.
"""
result = key
mask = None
if isinstance(key, ABCSeries) and not key.index.equals(index):
result = result.reindex(index)
mask = isna(result._values)
Expand All @@ -2319,6 +2322,9 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
"the indexed object do not match)."
)
result = result.astype(bool)._values
elif is_array_like(result) and is_extension_array_dtype(result.dtype):
assert result.dtype.kind == "b"
mask = isna(result)
else:
if is_sparse(result):
result = result.to_dense()
Expand All @@ -2330,7 +2336,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray:
"Item wrong length {} instead of {}.".format(len(result), len(index))
)

return result
return result, mask


def convert_missing_indexer(indexer):
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def __setstate__(self, state):
self.values = state[1]
self.ndim = self.values.ndim

def _slice(self, slicer):
def _slice(self, slicer, mask=None):
""" return a slice of my values """
return self.values[slicer]

Expand Down Expand Up @@ -1810,7 +1810,7 @@ def _can_hold_element(self, element: Any) -> bool:
# We're doing the same as CategoricalBlock here.
return True

def _slice(self, slicer):
def _slice(self, slicer, mask=None):
""" return a slice of my values """

# slice the category
Expand Down Expand Up @@ -2307,7 +2307,7 @@ def to_dense(self):
# expects that behavior.
return np.asarray(self.values, dtype=_NS_DTYPE)

def _slice(self, slicer):
def _slice(self, slicer, mask=None):
""" return a slice of my values """
if isinstance(slicer, tuple):
col, loc = slicer
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,7 @@ def combine(self, blocks, copy=True):

return type(self)(new_blocks, axes, do_integrity_check=False)

def get_slice(self, slobj, axis=0):
def get_slice(self, slobj, axis=0, mask=None):
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")

Expand Down Expand Up @@ -1505,11 +1505,13 @@ def _blklocs(self):
""" compat with BlockManager """
return None

def get_slice(self, slobj, axis=0):
def get_slice(self, slobj, axis=0, mask=None):
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")

return type(self)(self._block._slice(slobj), self.index[slobj], fastpath=True)
return type(self)(
self._block._slice(slobj, mask=mask), self.index[slobj], fastpath=True
)

@property
def index(self):
Expand Down
17 changes: 10 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,11 +892,13 @@ def __getitem__(self, key):
key = list(key)

if com.is_bool_indexer(key):
key = check_bool_indexer(self.index, key)
key, mask = check_bool_indexer(self.index, key)
else:
mask = None

return self._get_with(key)
return self._get_with(key, mask)

def _get_with(self, key):
def _get_with(self, key, mask=None):
# other: fancy integer or otherwise
if isinstance(key, slice):
return self._slice(key)
Expand All @@ -917,12 +919,13 @@ def _get_with(self, key):
return self._get_values(key)
raise

if not isinstance(key, (list, np.ndarray, Series, Index)):
if not is_list_like(key):
key = list(key)

if isinstance(key, Index):
key_type = key.inferred_type
else:
# TODO: why not use key.dtype?
key_type = lib.infer_dtype(key, skipna=False)

if key_type == "integer":
Expand All @@ -931,7 +934,7 @@ def _get_with(self, key):
else:
return self._get_values(key)
elif key_type == "boolean":
return self._get_values(key)
return self._get_values(key, mask)

if isinstance(key, (list, tuple)):
# TODO: de-dup with tuple case handled above?
Expand Down Expand Up @@ -960,10 +963,10 @@ def _get_values_tuple(self, key):
self
)

def _get_values(self, indexer):
def _get_values(self, indexer, mask=None):
try:
return self._constructor(
self._data.get_slice(indexer), fastpath=True
self._data.get_slice(indexer, mask=mask), fastpath=True
).__finalize__(self)
except ValueError:
# mpl compat if we look up e.g. ser[:, np.newaxis];
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2013,6 +2013,7 @@ def test_index_single_double_tuples(self, tpl):
tm.assert_frame_equal(result, expected)

def test_boolean_indexing(self):
# TODO: parametrize
idx = list(range(3))
cols = ["A", "B", "C"]
df1 = DataFrame(
Expand All @@ -2036,6 +2037,7 @@ def test_boolean_indexing(self):
df1[df1.index[:-1] > 2] = -1

def test_boolean_indexing_mixed(self):
# TODO: parametrize?
df = DataFrame(
{
0: {35: np.nan, 40: np.nan, 43: np.nan, 49: np.nan, 50: np.nan},
Expand Down
Loading