Skip to content

Commit

Permalink
PERF: MultiIndex.memory_usage shouldn't trigger the index engine
Browse files Browse the repository at this point in the history
Ignore the index engine when it isn't already cached.
  • Loading branch information
GianlucaFicarelli committed Apr 23, 2024
1 parent b111ac6 commit 6d4e407
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ Performance improvements
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
- Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`)
- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.bug_fixes:
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4841,8 +4841,9 @@ def _from_join_target(self, result: np.ndarray) -> ArrayLike:
def memory_usage(self, deep: bool = False) -> int:
result = self._memory_usage(deep=deep)

# include our engine hashtable
result += self._engine.sizeof(deep=deep)
# include our engine hashtable, only if it's already cached
if "_engine" in self._cache:
result += self._engine.sizeof(deep=deep)
return result

@final
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1391,8 +1391,9 @@ def _nbytes(self, deep: bool = False) -> int:
names_nbytes = sum(getsizeof(i, objsize) for i in self.names)
result = level_nbytes + label_nbytes + names_nbytes

# include our engine hashtable
result += self._engine.sizeof(deep=deep)
# include our engine hashtable, only if it's already cached
if "_engine" in self._cache:
result += self._engine.sizeof(deep=deep)
return result

# --------------------------------------------------------------------
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,31 @@ def test_memory_usage_components_narrow_series(any_real_numpy_dtype):
assert total_usage == non_index_usage + index_usage


def test_memory_usage_doesnt_trigger_engine(index):
index._cache.clear()
assert "_engine" not in index._cache

res_without_engine = index.memory_usage()
assert "_engine" not in index._cache

# explicitly load and cache the engine
_ = index._engine
assert "_engine" in index._cache

res_with_engine = index.memory_usage()

# the engine doesn't affect the result even when initialized with values, because
# index._engine.sizeof() doesn't consider the content of index._engine.values
assert res_with_engine == res_without_engine

if len(index) == 0:
assert res_without_engine == 0
assert res_with_engine == 0
else:
assert res_without_engine > 0
assert res_with_engine > 0


def test_searchsorted(request, index_or_series_obj):
# numpy.searchsorted calls obj.searchsorted under the hood.
# See gh-12238
Expand Down

0 comments on commit 6d4e407

Please sign in to comment.