Skip to content

Commit

Permalink
FIX-#3686: optimize __getitem__ flow of loc/iloc (#3694)
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Dec 1, 2021
1 parent f79cb85 commit 0947ee8
Show file tree
Hide file tree
Showing 4 changed files with 270 additions and 98 deletions.
7 changes: 6 additions & 1 deletion modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,12 @@ def _get_dict_of_block_index(self, axis, indices):
if isinstance(indices, slice) or (is_range_like(indices) and indices.step == 1):
# Converting range-like indexer to slice
indices = slice(indices.start, indices.stop, indices.step)
if indices == slice(None) or indices == slice(0, None):
# Detecting full-axis grab
if (
indices.start in (None, 0)
and indices.step in (None, 1)
and (indices.stop is None or indices.stop >= len(self.axes[axis]))
):
return OrderedDict(
zip(
range(self._partitions.shape[axis]),
Expand Down
273 changes: 204 additions & 69 deletions modin/pandas/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@

import numpy as np
import pandas
import itertools
from pandas.api.types import is_list_like, is_bool
from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.common import is_integer, is_bool_dtype, is_integer_dtype
from pandas.core.indexing import IndexingError
from modin.error_message import ErrorMessage

Expand Down Expand Up @@ -126,9 +127,34 @@ def is_boolean_array(x):
bool
True if argument is an array of bool, False otherwise.
"""
if isinstance(x, (np.ndarray, Series, pandas.Series, pandas.Index)):
return is_bool_dtype(x.dtype)
elif isinstance(x, (DataFrame, pandas.DataFrame)):
return all(map(is_bool_dtype, x.dtypes))
return is_list_like(x) and all(map(is_bool, x))


def is_integer_array(x):
"""
Check that argument is an array of integers.
Parameters
----------
x : object
Object to check.
Returns
-------
bool
True if argument is an array of integers, False otherwise.
"""
if isinstance(x, (np.ndarray, Series, pandas.Series, pandas.Index)):
return is_integer_dtype(x.dtype)
elif isinstance(x, (DataFrame, pandas.DataFrame)):
return all(map(is_integer_dtype, x.dtypes))
return is_list_like(x) and all(map(is_integer, x))


def is_integer_slice(x):
"""
Check that argument is an array of int.
Expand Down Expand Up @@ -175,6 +201,32 @@ def is_range_like(obj):
)


def boolean_mask_to_numeric(indexer):
"""
Convert boolean mask to numeric indices.
Parameters
----------
indexer : list-like of booleans
Returns
-------
np.ndarray of ints
Numerical positions of ``True`` elements in the passed `indexer`.
"""
if isinstance(indexer, (np.ndarray, Series, pandas.Series)):
return np.where(indexer)[0]
else:
# It's faster to build the resulting numpy array from the reduced amount of data via
# `compress` iterator than convert non-numpy-like `indexer` to numpy and apply `np.where`.
return np.fromiter(
# `itertools.compress` masks `data` with the `selectors` mask,
# works about ~10% faster than a pure list comprehension
itertools.compress(data=range(len(indexer)), selectors=indexer),
dtype=np.int64,
)


_ILOC_INT_ONLY_ERROR = """
Location based indexing can only have [integer, integer slice (START point is
INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types.
Expand Down Expand Up @@ -505,6 +557,43 @@ def _parse_row_and_column_locators(self, tup):
col_loc = col_loc(self.df) if callable(col_loc) else col_loc
return row_loc, col_loc, _compute_ndim(row_loc, col_loc)

# HACK: This method bypasses regular ``loc/iloc.__getitem__`` flow in order to ensure better
# performance in the case of boolean masking. The only purpose of this method is to compensate
# for a lack of backend's indexing API, there is no Query Compiler method allowing masking
# along both axis when any of the indexers is a boolean. That's why rows and columns masking
# phases are separate in this case.
# TODO: Remove this method and handle this case naturally via ``loc/iloc.__getitem__`` flow
# when QC API would support both-axis masking with boolean indexers.
def _handle_boolean_masking(self, row_loc, col_loc):
"""
Retrieve dataset according to the boolean mask for rows and an indexer for columns.
In comparison with the regular ``loc/iloc.__getitem__`` flow this method efficiently
masks rows with a Modin Series boolean mask without materializing it (if the selected
execution implements such masking).
Parameters
----------
row_loc : modin.pandas.Series of bool dtype
Boolean mask to index rows with.
col_loc : object
An indexer along column axis.
Returns
-------
modin.pandas.DataFrame or modin.pandas.Series
Located dataset.
"""
ErrorMessage.catch_bugs_and_request_email(
failure_condition=not isinstance(row_loc, Series),
extra_log=f"Only ``modin.pandas.Series`` boolean masks are acceptable, got: {type(row_loc)}",
)
masked_df = self.df.__constructor__(
query_compiler=self.qc.getitem_array(row_loc._query_compiler)
)
# Passing `slice(None)` as a row indexer since we've just applied it
return type(self)(masked_df)[(slice(None), col_loc)]


class _LocIndexer(_LocationIndexerBase):
"""
Expand Down Expand Up @@ -539,32 +628,22 @@ def __getitem__(self, key):
row_loc, col_loc, ndim = self._parse_row_and_column_locators(key)
self.row_scalar = is_scalar(row_loc)
self.col_scalar = is_scalar(col_loc)
if isinstance(row_loc, slice) and row_loc == slice(None):
# If we're only slicing columns, handle the case with `__getitem__`
if not isinstance(col_loc, slice):
# Boolean indexers can just be sliced into the columns object and
# then passed to `__getitem__`
if is_boolean_array(col_loc):
col_loc = self.df.columns[col_loc]
return self.df.__getitem__(col_loc)
else:
result_slice = self.df.columns.slice_locs(col_loc.start, col_loc.stop)
return self.df.iloc[:, slice(*result_slice)]

if isinstance(row_loc, Series) and is_boolean_array(row_loc):
return self._handle_boolean_masking(row_loc, col_loc)

row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
if any(i == -1 for i in row_lookup) or any(i == -1 for i in col_lookup):
raise KeyError(
"Passing list-likes to .loc or [] with any missing labels is no longer "
"supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"
)
result = super(_LocIndexer, self).__getitem__(row_lookup, col_lookup, ndim)
if isinstance(result, Series):
result._parent = self.df
result._parent_axis = 0
col_loc_as_list = [col_loc] if self.col_scalar else col_loc
row_loc_as_list = [row_loc] if self.row_scalar else row_loc
# Pandas drops the levels that are in the `loc`, so we have to as well.
if hasattr(result, "index") and isinstance(result.index, pandas.MultiIndex):
if (
isinstance(result, (Series, DataFrame))
and result._query_compiler.has_multiindex()
):
if (
isinstance(result, Series)
and not isinstance(col_loc_as_list, slice)
Expand All @@ -583,7 +662,7 @@ def __getitem__(self, key):
if (
hasattr(result, "columns")
and not isinstance(col_loc_as_list, slice)
and isinstance(result.columns, pandas.MultiIndex)
and result._query_compiler.has_multiindex(axis=1)
and all(
col_loc_as_list[i] in result.columns.levels[i]
for i in range(len(col_loc_as_list))
Expand Down Expand Up @@ -689,49 +768,84 @@ def _compute_lookup(self, row_loc, col_loc):
Returns
-------
row_lookup : numpy.ndarray
row_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
List of index labels.
col_lookup : numpy.ndarray
col_lookup : slice(None) if full axis grab, pandas.RangeIndex if repetition is detected, numpy.ndarray otherwise
List of columns labels.
"""
row_loc = [row_loc] if is_scalar(row_loc) else row_loc
col_loc = [col_loc] if is_scalar(col_loc) else col_loc
if is_list_like(row_loc) and len(row_loc) == 1:
if (
isinstance(self.qc.index.values[0], np.datetime64)
and type(row_loc[0]) != np.datetime64
):
row_loc = [pandas.to_datetime(row_loc[0])]
if isinstance(row_loc, slice):
row_lookup = self.qc.index.get_indexer_for(
self.qc.index.to_series().loc[row_loc]
)
elif self.qc.has_multiindex():
if isinstance(row_loc, pandas.MultiIndex):
row_lookup = self.qc.index.get_indexer_for(row_loc)
else:
row_lookup = self.qc.index.get_locs(row_loc)
elif is_boolean_array(row_loc):
# If passed in a list of booleans, we return the index of the true values
row_lookup = [i for i, row_val in enumerate(row_loc) if row_val]
else:
row_lookup = self.qc.index.get_indexer_for(row_loc)
if isinstance(col_loc, slice):
col_lookup = self.qc.columns.get_indexer_for(
self.qc.columns.to_series().loc[col_loc]
)
elif self.qc.has_multiindex(axis=1):
if isinstance(col_loc, pandas.MultiIndex):
col_lookup = self.qc.columns.get_indexer_for(col_loc)
Notes
-----
Usage of `slice(None)` as a resulting lookup is a hack to pass information about
full-axis grab without computing actual indices that triggers lazy computations.
Ideally, this API should get rid of using slices as indexers and either use a
common ``Indexer`` object or range and ``np.ndarray`` only.
"""
lookups = []
for axis, axis_loc in enumerate((row_loc, col_loc)):
if is_scalar(axis_loc):
axis_loc = np.array([axis_loc])
if isinstance(axis_loc, slice) or is_range_like(axis_loc):
if isinstance(axis_loc, slice) and axis_loc == slice(None):
axis_lookup = axis_loc
else:
axis_labels = self.qc.get_axis(axis)
# `slice_indexer` returns a fully-defined numeric slice for a non-fully-defined labels-based slice
axis_lookup = axis_labels.slice_indexer(
axis_loc.start, axis_loc.stop, axis_loc.step
)
# Converting negative indices to their actual positions:
axis_lookup = pandas.RangeIndex(
start=(
axis_lookup.start
if axis_lookup.start >= 0
else axis_lookup.start + len(axis_labels)
),
stop=(
axis_lookup.stop
if axis_lookup.stop >= 0
else axis_lookup.stop + len(axis_labels)
),
step=axis_lookup.step,
)
elif self.qc.has_multiindex(axis):
# `Index.get_locs` raises an IndexError by itself if missing labels were provided,
# we don't have to do missing-check for the received `axis_lookup`.
if isinstance(axis_loc, pandas.MultiIndex):
axis_lookup = self.qc.get_axis(axis).get_indexer_for(axis_loc)
else:
axis_lookup = self.qc.get_axis(axis).get_locs(axis_loc)
elif is_boolean_array(axis_loc):
axis_lookup = boolean_mask_to_numeric(axis_loc)
else:
col_lookup = self.qc.columns.get_locs(col_loc)
elif is_boolean_array(col_loc):
# If passed in a list of booleans, we return the index of the true values
col_lookup = [i for i, col_val in enumerate(col_loc) if col_val]
else:
col_lookup = self.qc.columns.get_indexer_for(col_loc)
return row_lookup, col_lookup
axis_labels = self.qc.get_axis(axis)
if is_list_like(axis_loc) and not isinstance(
axis_loc, (np.ndarray, pandas.Index)
):
# `Index.get_indexer_for` works much faster with numpy arrays than with python lists,
# so although we lose some time here on converting to numpy, `Index.get_indexer_for`
# speedup covers the loss that we gain here.
axis_loc = np.array(axis_loc, dtype=axis_labels.dtype)
axis_lookup = axis_labels.get_indexer_for(axis_loc)
# `Index.get_indexer_for` sets -1 value for missing labels, we have to verify whether
# there are any -1 in the received indexer to raise a KeyError here.
missing_mask = axis_lookup < 0
if missing_mask.any():
missing_labels = (
# Converting `axis_loc` to maskable `np.array` to not fail
# on masking non-maskable list-like
np.array(axis_loc)[missing_mask]
if is_list_like(axis_loc)
# If `axis_loc` is not a list-like then we can't select certain
# labels that are missing and so printing the whole indexer
else axis_loc
)
raise KeyError(missing_labels)

if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup):
axis_lookup = axis_lookup.values

lookups.append(axis_lookup)
return lookups


class _iLocIndexer(_LocationIndexerBase):
Expand Down Expand Up @@ -770,6 +884,9 @@ def __getitem__(self, key):
self._check_dtypes(row_loc)
self._check_dtypes(col_loc)

if isinstance(row_loc, Series) and is_boolean_array(row_loc):
return self._handle_boolean_masking(row_loc, col_loc)

row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc)
result = super(_iLocIndexer, self).__getitem__(row_lookup, col_lookup, ndim)
if isinstance(result, Series):
Expand Down Expand Up @@ -833,25 +950,43 @@ def _compute_lookup(self, row_loc, col_loc):
Ideally, this API should get rid of using slices as indexers and either use a
common ``Indexer`` object or range and ``np.ndarray`` only.
"""
row_loc = [row_loc] if is_scalar(row_loc) else row_loc
col_loc = [col_loc] if is_scalar(col_loc) else col_loc
lookups = []
for axis, axis_loc in enumerate((row_loc, col_loc)):
if isinstance(axis_loc, slice) and axis_loc.step is None:
if is_scalar(axis_loc):
axis_loc = np.array([axis_loc])
if isinstance(axis_loc, slice):
axis_lookup = (
axis_loc
if axis_loc == slice(None)
else pandas.RangeIndex(
*axis_loc.indices(len(self.qc.get_axis(axis)))
)
)
else:
axis_lookup = (
pandas.RangeIndex(len(self.qc.get_axis(axis)))
.to_series()
.iloc[axis_loc]
.index
elif is_range_like(axis_loc):
axis_lookup = pandas.RangeIndex(
axis_loc.start, axis_loc.stop, axis_loc.step
)
elif is_boolean_array(axis_loc):
axis_lookup = boolean_mask_to_numeric(axis_loc)
else:
if isinstance(axis_loc, pandas.Index):
axis_loc = axis_loc.values
elif is_list_like(axis_loc) and not isinstance(axis_loc, np.ndarray):
# `Index.__getitem__` works much faster with numpy arrays than with python lists,
# so although we lose some time here on converting to numpy, `Index.__getitem__`
# speedup covers the loss that we gain here.
axis_loc = np.array(axis_loc, dtype=np.int64)
# Relatively fast check allows us to not trigger `self.qc.get_axis()` computation
# if there're no negative indices and so they don't not depend on the axis length.
if isinstance(axis_loc, np.ndarray) and not (axis_loc < 0).any():
axis_lookup = axis_loc
else:
axis_lookup = pandas.RangeIndex(len(self.qc.get_axis(axis)))[
axis_loc
]

if isinstance(axis_lookup, pandas.Index) and not is_range_like(axis_lookup):
axis_lookup = axis_lookup.values
lookups.append(axis_lookup)
return lookups

Expand All @@ -871,8 +1006,8 @@ def _check_dtypes(self, locator):
"""
is_int = is_integer(locator)
is_int_slice = is_integer_slice(locator)
is_int_list = is_list_like(locator) and all(map(is_integer, locator))
is_int_arr = is_integer_array(locator)
is_bool_arr = is_boolean_array(locator)

if not any([is_int, is_int_slice, is_int_list, is_bool_arr]):
if not any([is_int, is_int_slice, is_int_arr, is_bool_arr]):
raise ValueError(_ILOC_INT_ONLY_ERROR)

0 comments on commit 0947ee8

Please sign in to comment.