Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: generalized check_array_indexer for validating array-like getitem indexers #31150

Merged
merged 20 commits into from
Jan 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
e8f539a
API: generalized check_array_indexer for validating array-like indexers
jorisvandenbossche Jan 20, 2020
4fa9f5a
test boolean message as well
jorisvandenbossche Jan 20, 2020
b55dfd2
fixes for failing tests
jorisvandenbossche Jan 20, 2020
095b741
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
jorisvandenbossche Jan 22, 2020
58bfe78
remove previous check_bool_array_indexer
jorisvandenbossche Jan 22, 2020
5ce8d85
don't convert tuples to avoid warning from numpy
jorisvandenbossche Jan 22, 2020
ebc2150
ensure check_bool_indexer returns numpy array
jorisvandenbossche Jan 22, 2020
4a51d97
raise warning for categorical
jorisvandenbossche Jan 22, 2020
50490aa
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
jorisvandenbossche Jan 24, 2020
c979df8
move deprecate_ndim_indexing
jorisvandenbossche Jan 24, 2020
ce2e042
cleanup; ensure output of check_array_indexer is always an ndarray
jorisvandenbossche Jan 24, 2020
4d447bf
clean-up black reformatting
jorisvandenbossche Jan 24, 2020
d930e84
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
jorisvandenbossche Jan 27, 2020
9ed8fe9
fix check_bool_indexer
jorisvandenbossche Jan 28, 2020
2f8cd27
add comment to check_bool_indexer
jorisvandenbossche Jan 28, 2020
4d9a201
fix empty list case
jorisvandenbossche Jan 28, 2020
097d221
add specific tests for check_array_indexer
jorisvandenbossche Jan 28, 2020
3c5e4c6
allow list-length-1-with-slice corner case
jorisvandenbossche Jan 28, 2020
1ca35d1
move list-like check inside
jorisvandenbossche Jan 28, 2020
e5ea9b4
Merge remote-tracking branch 'upstream/master' into EA-check-indexer
TomAugspurger Jan 28, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/reference/extensions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ behaves correctly.
.. autosummary::
:toctree: api/

api.indexers.check_bool_array_indexer
api.indexers.check_array_indexer


The sentinel ``pandas.api.extensions.no_default`` is used as the default
Expand Down
4 changes: 2 additions & 2 deletions pandas/api/indexers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Public API for Rolling Window Indexers.
"""

from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer
from pandas.core.window.indexers import BaseIndexer

__all__ = ["check_bool_array_indexer", "BaseIndexer"]
__all__ = ["check_array_indexer", "BaseIndexer"]
11 changes: 4 additions & 7 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like, is_hashable
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna

from pandas.core import ops
Expand All @@ -54,7 +54,7 @@
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing
from pandas.core.missing import interpolate_2d
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.sorting import nargsort
Expand Down Expand Up @@ -2001,14 +2001,11 @@ def __getitem__(self, key):
else:
return self.categories[i]

if is_list_like(key) and not is_array_like(key):
key = np.asarray(key)

if com.is_bool_indexer(key):
key = check_bool_array_indexer(self, key)
key = check_array_indexer(self, key)

result = self._codes[key]
if result.ndim > 1:
deprecate_ndim_indexing(result)
return result
return self._constructor(result, dtype=self.dtype, fastpath=True)

Expand Down
13 changes: 11 additions & 2 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.ops.invalid import invalid_comparison, make_invalid_op

Expand Down Expand Up @@ -518,11 +518,20 @@ def __getitem__(self, key):
return type(self)(val, dtype=self.dtype)

if com.is_bool_indexer(key):
key = check_bool_array_indexer(self, key)
# first convert to boolean, because check_array_indexer doesn't
# allow object dtype
key = np.asarray(key, dtype=bool)
key = check_array_indexer(self, key)
if key.all():
key = slice(0, None, None)
else:
key = lib.maybe_booleans_to_slice(key.view(np.uint8))
elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice):
# see https://github.com/pandas-dev/pandas/issues/31299, need to allow
# this for now (would otherwise raise in check_array_indexer)
pass
else:
key = check_array_indexer(self, key)

is_period = is_period_dtype(self)
if is_period:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from pandas.core.arrays.categorical import Categorical
import pandas.core.common as com
from pandas.core.construction import array
from pandas.core.indexers import check_array_indexer
from pandas.core.indexes.base import ensure_index

_VALID_CLOSED = {"left", "right", "both", "neither"}
Expand Down Expand Up @@ -495,6 +496,7 @@ def __len__(self) -> int:
return len(self.left)

def __getitem__(self, value):
value = check_array_indexer(self, value)
left = self.left[value]
right = self.right[value]

Expand Down
6 changes: 2 additions & 4 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@

from pandas.core.algorithms import take
from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
from pandas._typing import Scalar
Expand All @@ -35,8 +34,7 @@ def __getitem__(self, item):
return self.dtype.na_value
return self._data[item]

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)
item = check_array_indexer(self, item)

return type(self)(self._data[item], self._mask[item])

Expand Down
6 changes: 2 additions & 4 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
from pandas.core import nanops
from pandas.core.algorithms import searchsorted, take, unique
from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin
import pandas.core.common as com
from pandas.core.construction import extract_array
from pandas.core.indexers import check_bool_array_indexer
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import backfill_1d, pad_1d


Expand Down Expand Up @@ -234,8 +233,7 @@ def __getitem__(self, item):
if isinstance(item, type(self)):
item = item._ndarray

elif com.is_bool_indexer(item):
item = check_bool_array_indexer(self, item)
item = check_array_indexer(self, item)

result = self._ndarray[item]
if not lib.is_scalar(item):
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.construction import sanitize_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import interpolate_2d
import pandas.core.ops as ops
from pandas.core.ops.common import unpack_zerodim_and_defer
Expand Down Expand Up @@ -768,6 +769,8 @@ def __getitem__(self, key):
else:
key = np.asarray(key)

key = check_array_indexer(self, key)

if com.is_bool_indexer(key):
key = check_bool_indexer(self, key)

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ def is_bool_indexer(key: Any) -> bool:

See Also
--------
check_bool_array_indexer : Check that `key`
is a valid mask for an array, and convert to an ndarray.
check_array_indexer : Check that `key` is a valid array to index,
and convert to an ndarray.
"""
na_msg = "cannot mask with array containing NA / NaN values"
if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
Expand Down
153 changes: 130 additions & 23 deletions pandas/core/indexers.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
"""
Low-dependency indexing utilities.
"""
import warnings

import numpy as np

from pandas._typing import AnyArrayLike
from pandas._typing import Any, AnyArrayLike

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.common import (
is_array_like,
is_bool_dtype,
is_integer_dtype,
is_list_like,
)
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries

# -----------------------------------------------------------
Expand Down Expand Up @@ -244,66 +251,166 @@ def length_of_indexer(indexer, target=None) -> int:
raise AssertionError("cannot find the length of the indexer")


def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray:
def deprecate_ndim_indexing(result):
"""
Helper function to raise the deprecation warning for multi-dimensional
indexing on 1D Series/Index.

GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
and keep an index, so we currently return ndarray, which is deprecated
(Deprecation GH#30588).
"""
Check if `mask` is a valid boolean indexer for `array`.
if np.ndim(result) > 1:
warnings.warn(
"Support for multi-dimensional indexing (e.g. `index[:, None]`) "
"on an Index is deprecated and will be removed in a future "
"version. Convert to a numpy array before indexing instead.",
DeprecationWarning,
stacklevel=3,
)


# -----------------------------------------------------------
# Public indexer validation

`array` and `mask` are checked to have the same length, and the
dtype is validated.

def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
"""
Check if `indexer` is a valid array indexer for `array`.

For a boolean mask, `array` and `indexer` are checked to have the same
length. The dtype is validated, and if it is an integer or boolean
ExtensionArray, it is checked if there are missing values present, and
it is converted to the appropriate numpy array. Other dtypes will raise
an error.

Non-array indexers (integer, slice, Ellipsis, tuples, ...) are passed
through as is.

.. versionadded:: 1.0.0

Parameters
----------
array : array
The array that's being masked.
mask : array
The boolean array that's masking.
array : array-like
The array that is being indexed (only used for the length).
indexer : array-like or list-like
The array-like that's used to index. List-like input that is not yet
a numpy array or an ExtensionArray is converted to one. Other input
types are passed through as is.

Returns
-------
numpy.ndarray
The validated boolean mask.
The validated indexer as a numpy array that can be used to index.

Raises
------
IndexError
When the lengths don't match.
ValueError
When `mask` cannot be converted to a bool-dtype ndarray.
When `indexer` cannot be converted to a numpy ndarray to index
(e.g. presence of missing values).

See Also
--------
api.types.is_bool_dtype : Check if `key` is of boolean dtype.

Examples
--------
A boolean ndarray is returned when the arguments are all valid.
When checking a boolean mask, a boolean ndarray is returned when the
arguments are all valid.

>>> mask = pd.array([True, False])
>>> arr = pd.array([1, 2])
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
>>> pd.api.indexers.check_array_indexer(arr, mask)
array([ True, False])

An IndexError is raised when the lengths don't match.

>>> mask = pd.array([True, False, True])
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
>>> pd.api.indexers.check_array_indexer(arr, mask)
Traceback (most recent call last):
...
IndexError: Item wrong length 3 instead of 2.
IndexError: Boolean index has wrong length: 3 instead of 2.

A ValueError is raised when the mask cannot be converted to
a bool-dtype ndarray.

>>> mask = pd.array([True, pd.NA])
>>> pd.api.extensions.check_bool_array_indexer(arr, mask)
>>> pd.api.indexers.check_array_indexer(arr, mask)
Traceback (most recent call last):
...
ValueError: Cannot mask with a boolean indexer containing NA values

A numpy boolean mask will get passed through (if the length is correct):

>>> mask = np.array([True, False])
>>> pd.api.indexers.check_array_indexer(arr, mask)
array([ True, False])

Similarly for integer indexers, an integer ndarray is returned when it is
a valid indexer, otherwise an error is raised (for integer indexers, a
matching length is not required):

>>> indexer = pd.array([0, 2], dtype="Int64")
>>> arr = pd.array([1, 2, 3])
>>> pd.api.indexers.check_array_indexer(arr, indexer)
array([0, 2])

>>> indexer = pd.array([0, pd.NA], dtype="Int64")
jreback marked this conversation as resolved.
Show resolved Hide resolved
>>> pd.api.indexers.check_array_indexer(arr, indexer)
Traceback (most recent call last):
...
ValueError: Cannot index with an integer indexer containing NA values

For non-integer/boolean dtypes, an appropriate error is raised:

>>> indexer = np.array([0., 2.], dtype="float64")
>>> pd.api.indexers.check_array_indexer(arr, indexer)
Traceback (most recent call last):
...
ValueError: cannot convert to bool numpy array in presence of missing values
IndexError: arrays used as indices must be of integer or boolean type
"""
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
result = np.asarray(mask, dtype=bool)
# GH26658
if len(result) != len(array):
raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.")
return result
from pandas.core.construction import array as pd_array

# whatever is not an array-like is returned as-is (possible valid array
# indexers that are not array-like: integer, slice, Ellipsis, None)
# In this context, tuples are not considered as array-like, as they have
# a specific meaning in indexing (multi-dimensional indexing)
if is_list_like(indexer):
if isinstance(indexer, tuple):
return indexer
else:
return indexer

# convert list-likes to array
if not is_array_like(indexer):
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
indexer = pd_array(indexer)
if len(indexer) == 0:
# empty list is converted to float array by pd.array
indexer = np.array([], dtype=np.intp)

dtype = indexer.dtype
if is_bool_dtype(dtype):
try:
indexer = np.asarray(indexer, dtype=bool)
except ValueError:
raise ValueError("Cannot mask with a boolean indexer containing NA values")

# GH26658
if len(indexer) != len(array):
raise IndexError(
f"Boolean index has wrong length: "
f"{len(indexer)} instead of {len(array)}"
)
elif is_integer_dtype(dtype):
jreback marked this conversation as resolved.
Show resolved Hide resolved
try:
indexer = np.asarray(indexer, dtype=np.intp)
except ValueError:
raise ValueError(
"Cannot index with an integer indexer containing NA values"
)
else:
raise IndexError("arrays used as indices must be of integer or boolean type")

return indexer
Loading