From 3ab86011b42ae52e03403539632ea9ad26c56fdd Mon Sep 17 00:00:00 2001 From: Gianluca Ficarelli Date: Wed, 24 Apr 2024 17:30:04 +0200 Subject: [PATCH] PERF: MultiIndex._engine use smaller dtypes --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/index.pyi | 4 +- pandas/_libs/index.pyx | 45 +++++++-- pandas/core/indexes/multi.py | 106 ++++++++------------ pandas/tests/indexes/multi/test_indexing.py | 41 +++++--- 5 files changed, 111 insertions(+), 86 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 59cc709359a8d..6a46b9a0f8fad 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -332,6 +332,7 @@ Performance improvements - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`) - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`) - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`) +- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`) - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`) - Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`) - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 12a5bf245977e..bf6d8ba8973d3 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ... 
class BaseMultiIndexCodesEngine: levels: list[np.ndarray] - offsets: np.ndarray # ndarray[uint64_t, ndim=1] + offsets: np.ndarray # np.ndarray[..., ndim=1] def __init__( self, levels: list[Index], # all entries hashable labels: list[np.ndarray], # all entries integer-dtyped - offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] + offsets: np.ndarray, # np.ndarray[..., ndim=1] ) -> None: ... def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index a700074d46ba8..1541145dc0f12 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -712,14 +712,16 @@ cdef class BaseMultiIndexCodesEngine: Pre-calculated offsets, one for each level of the index. """ self.levels = levels - self.offsets = offsets + # Downcast the type if possible, to prevent upcasting when shifting codes: + self.offsets = offsets.astype(np.min_scalar_type(offsets[0]), copy=False) # Transform labels in a single array, and add 2 so that we are working # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. 
We will set values that are not found to 0 later: - labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift - codes = labels_arr.astype("uint64", copy=False) + codes = np.array(labels).T + codes += multiindex_nulls_shift # inplace sum optimisation + self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously @@ -731,8 +733,37 @@ cdef class BaseMultiIndexCodesEngine: # integers representing labels: we will use its get_loc and get_indexer self._base.__init__(self, lab_ints) - def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: - raise NotImplementedError("Implemented by subclass") # pragma: no cover + def _codes_to_ints(self, ndarray codes) -> np.ndarray: + """ + Transform combination(s) of uint in one uint or Python integer (each), in a + strictly monotonic way (i.e. respecting the lexicographic order of integer + combinations). + + Parameters + ---------- + codes : 1- or 2-dimensional array of dtype uint + Combinations of integers (one per row) + + Returns + ------- + scalar or 1-dimensional array, of dtype _codes_dtype + Integer(s) representing one combination (each). + """ + # To avoid overflows, first make sure we are working with the right dtype: + codes = codes.astype(self._codes_dtype, copy=False) + + # Shift the representation of each level by the pre-calculated number of bits: + codes <<= self.offsets # inplace shift optimisation + + # Now sum and OR are in fact interchangeable. This is a simple + # composition of the (disjunct) significant bits of each level (i.e. 
+ # each column in "codes") in a single positive integer (per row): + if codes.ndim == 1: + # Single key + return np.bitwise_or.reduce(codes) + + # Multiple keys + return np.bitwise_or.reduce(codes, axis=1) def _extract_level_codes(self, target) -> np.ndarray: """ @@ -757,7 +788,7 @@ cdef class BaseMultiIndexCodesEngine: codes[codes > 0] += 1 if self.level_has_nans[i]: codes[target.codes[i] == -1] += 1 - return self._codes_to_ints(np.array(level_codes, dtype="uint64").T) + return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T) def get_indexer(self, target: np.ndarray) -> np.ndarray: """ @@ -788,7 +819,7 @@ cdef class BaseMultiIndexCodesEngine: raise KeyError(key) # Transform indices into single integer: - lab_int = self._codes_to_ints(np.array(indices, dtype="uint64")) + lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype)) return self._base.get_loc(self, lab_int) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c8e16fad00d5b..19c29515d3ecc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -123,84 +123,56 @@ ) -class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): - """ - This class manages a MultiIndex by mapping label combinations to positive - integers. +class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + The number of possible label combinations must not overflow the 64 bits integers. """ _base = libindex.UInt64Engine + _codes_dtype = "uint64" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one uint64 (each), in a strictly - monotonic way (i.e. respecting the lexicographic order of integer - combinations): see BaseMultiIndexCodesEngine documentation. 
- Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - scalar or 1-dimensional array, of dtype uint64 - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits: - codes <<= self.offsets + The number of possible label combinations must not overflow the 32 bits integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer: - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt32Engine + _codes_dtype = "uint32" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) +class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. -class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): - """ - This class manages those (extreme) cases in which the number of possible - label combinations overflows the 64 bits integers, and uses an ObjectEngine - containing Python integers. + The number of possible label combinations must not overflow the 16 bits integers. """ - _base = libindex.ObjectEngine + _base = libindex.UInt16Engine + _codes_dtype = "uint16" - def _codes_to_ints(self, codes): - """ - Transform combination(s) of uint64 in one Python integer (each), in a - strictly monotonic way (i.e. respecting the lexicographic order of - integer combinations): see BaseMultiIndexCodesEngine documentation. 
- Parameters - ---------- - codes : 1- or 2-dimensional array of dtype uint64 - Combinations of integers (one per row) +class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine): + """Manages a MultiIndex by mapping label combinations to positive integers. - Returns - ------- - int, or 1-dimensional array of dtype object - Integer(s) representing one combination (each). - """ - # Shift the representation of each level by the pre-calculated number - # of bits. Since this can overflow uint64, first make sure we are - # working with Python integers: - codes = codes.astype("object") << self.offsets + The number of possible label combinations must not overflow the 8 bits integers. + """ - # Now sum and OR are in fact interchangeable. This is a simple - # composition of the (disjunct) significant bits of each level (i.e. - # each column in "codes") in a single positive integer (per row): - if codes.ndim == 1: - # Single key - return np.bitwise_or.reduce(codes) + _base = libindex.UInt8Engine + _codes_dtype = "uint8" - # Multiple keys - return np.bitwise_or.reduce(codes, axis=1) + +class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine): + """Manages a MultiIndex by mapping label combinations to positive integers. + + This class manages those (extreme) cases in which the number of possible + label combinations overflows the 64 bits integers, and uses an ObjectEngine + containing Python integers. 
+ """ + + _base = libindex.ObjectEngine + _codes_dtype = "object" def names_compat(meth: F) -> F: @@ -1235,7 +1207,17 @@ def _engine(self): if lev_bits[0] > 64: # The levels would overflow a 64 bit uint - use Python integers: return MultiIndexPyIntEngine(self.levels, self.codes, offsets) - return MultiIndexUIntEngine(self.levels, self.codes, offsets) + if lev_bits[0] > 32: + # The levels would overflow a 32 bit uint - use uint64 + return MultiIndexUInt64Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 16: + # The levels would overflow a 16 bit uint - use uint32 + return MultiIndexUInt32Engine(self.levels, self.codes, offsets) + if lev_bits[0] > 8: + # The levels would overflow an 8 bit uint - use uint16 + return MultiIndexUInt16Engine(self.levels, self.codes, offsets) + # The levels fit in an 8 bit uint - use uint8 + return MultiIndexUInt8Engine(self.levels, self.codes, offsets) # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return # type "Type[MultiIndex]" in supertype "Index" diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 18d64999de496..f08a7625e7f8a 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id assert result == expected -def test_pyint_engine(): +@pytest.mark.parametrize( + "N, expected_dtype", + [ + (1, "uint8"), # 2*4*N = 8 + (2, "uint16"), # 2*4*N = 16 + (4, "uint32"), # 2*4*N = 32 + (8, "uint64"), # 2*4*N = 64 + (10, "object"), # 2*4*N = 80 + ], +) +def test_pyint_engine(N, expected_dtype): # GH#18519 : when combinations of codes cannot be represented in 64 # bits, the index underlying the MultiIndex engine works with Python # integers, rather than uint64. 
- N = 5 keys = [ tuple(arr) for arr in [ - [0] * 10 * N, - [1] * 10 * N, - [2] * 10 * N, - [np.nan] * N + [2] * 9 * N, - [0] * N + [2] * 9 * N, - [np.nan] * N + [2] * 8 * N + [0] * N, + [0] * 4 * N, + [1] * 4 * N, + [np.nan] * N + [0] * 3 * N, + [0] * N + [1] * 3 * N, + [np.nan] * N + [1] * 2 * N + [0] * N, ] ] - # Each level contains 4 elements (including NaN), so it is represented - # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a - # 64 bit engine and truncating the first levels, the fourth and fifth - # keys would collide; if truncating the last levels, the fifth and - # sixth; if rotating bits rather than shifting, the third and fifth. + # Each level contains 3 elements (NaN, 0, 1), and it's represented + # in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for + # a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4. + # If we were using a 64 bit engine and truncating the first levels, the + # fourth and fifth keys would collide; if truncating the last levels, the + # fifth and sixth; if rotating bits rather than shifting, the third and fifth. + + index = MultiIndex.from_tuples(keys) + assert index._engine.values.dtype == expected_dtype for idx, key_value in enumerate(keys): - index = MultiIndex.from_tuples(keys) assert index.get_loc(key_value) == idx expected = np.arange(idx + 1, dtype=np.intp) @@ -952,7 +963,7 @@ def test_pyint_engine(): # With missing key: idces = range(len(keys)) expected = np.array([-1] + list(idces), dtype=np.intp) - missing = tuple([0, 1] * 5 * N) + missing = tuple([0, 1, 0, 1] * N) result = index.get_indexer([missing] + [keys[i] for i in idces]) tm.assert_numpy_array_equal(result, expected)