Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix estimate_pandas_size for pd.MultiIndex #2707

Merged
merged 3 commits into from
Feb 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mars/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
from typing import NamedTuple, Optional

version_info = (0, 9, 0, "a2")
version_info = (0, 9, 0, "b1")
_num_index = max(idx if isinstance(v, int) else 0 for idx, v in enumerate(version_info))
__version__ = ".".join(map(str, version_info[: _num_index + 1])) + "".join(
version_info[_num_index + 1 :]
Expand Down
43 changes: 42 additions & 1 deletion mars/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,11 +508,52 @@ def test_estimate_pandas_size():

s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
assert utils.estimate_pandas_size(s3) != sys.getsizeof(s3)
assert (
pytest.approx(utils.estimate_pandas_size(s3) / sys.getsizeof(s3), abs=0.5) == 1
)

idx1 = pd.MultiIndex.from_arrays(
[np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))]
)
assert utils.estimate_pandas_size(idx1) != sys.getsizeof(idx1)
assert utils.estimate_pandas_size(idx1) == sys.getsizeof(idx1)

string_idx = pd.Index(np.random.choice(["a", "bb", "cc"], size=(1000,)))
assert utils.estimate_pandas_size(string_idx) != sys.getsizeof(string_idx)
assert (
pytest.approx(
utils.estimate_pandas_size(string_idx) / sys.getsizeof(string_idx), abs=0.5
)
== 1
)

# dataframe with multi index
idx2 = pd.MultiIndex.from_arrays(
[np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))]
)
df4 = pd.DataFrame(
{
"A": np.random.choice(["abcd", "def", "gh"], size=(1000,)),
"B": np.random.rand(1000),
"C": np.random.rand(1000),
},
index=idx2,
)
assert utils.estimate_pandas_size(df4) != sys.getsizeof(df4)
assert (
pytest.approx(utils.estimate_pandas_size(df4) / sys.getsizeof(df4), abs=0.5)
== 1
)

# series with multi index
idx3 = pd.MultiIndex.from_arrays(
[
np.random.choice(["a1", "a2", "a3"], size=(1000,)),
np.random.choice(["abcd", "def", "gh"], size=(1000,)),
]
)
s4 = pd.Series(np.arange(1000), index=idx3)

assert utils.estimate_pandas_size(s4) == sys.getsizeof(s4)


@require_ray
Expand Down
47 changes: 31 additions & 16 deletions mars/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,10 +424,13 @@ def calc_data_size(dt: Any, shape: Tuple[int] = None) -> int:


def estimate_pandas_size(
df_obj, max_samples: int = 10, min_sample_rows: int = 100
pd_obj, max_samples: int = 10, min_sample_rows: int = 100
) -> int:
if len(df_obj) <= min_sample_rows or isinstance(df_obj, pd.RangeIndex):
return sys.getsizeof(df_obj)
if len(pd_obj) <= min_sample_rows or isinstance(pd_obj, pd.RangeIndex):
return sys.getsizeof(pd_obj)
if isinstance(pd_obj, pd.MultiIndex):
# the size of a sampled MultiIndex can't be used to estimate the full size
return sys.getsizeof(pd_obj)

from .dataframe.arrays import ArrowDtype

Expand All @@ -438,14 +441,16 @@ def _is_fast_dtype(dtype):
return isinstance(dtype, ArrowDtype)

dtypes = []
if isinstance(df_obj, pd.DataFrame):
dtypes.extend(df_obj.dtypes)
index_obj = df_obj.index
elif isinstance(df_obj, pd.Series):
dtypes.append(df_obj.dtype)
index_obj = df_obj.index
is_series = False
if isinstance(pd_obj, pd.DataFrame):
dtypes.extend(pd_obj.dtypes)
index_obj = pd_obj.index
elif isinstance(pd_obj, pd.Series):
dtypes.append(pd_obj.dtype)
index_obj = pd_obj.index
is_series = True
else:
index_obj = df_obj
index_obj = pd_obj

# handling possible MultiIndex
if hasattr(index_obj, "dtypes"):
Expand All @@ -454,12 +459,22 @@ def _is_fast_dtype(dtype):
dtypes.append(index_obj.dtype)

if all(_is_fast_dtype(dtype) for dtype in dtypes):
return sys.getsizeof(df_obj)

indices = np.sort(np.random.choice(len(df_obj), size=max_samples, replace=False))
iloc = df_obj if isinstance(df_obj, pd.Index) else df_obj.iloc
sample_size = sys.getsizeof(iloc[indices])
return sample_size * len(df_obj) // max_samples
return sys.getsizeof(pd_obj)

indices = np.sort(np.random.choice(len(pd_obj), size=max_samples, replace=False))
iloc = pd_obj if isinstance(pd_obj, pd.Index) else pd_obj.iloc
if isinstance(index_obj, pd.MultiIndex):
# The size of a sampled MultiIndex is much greater than expected, so we
# calculate the index size separately.
index_size = sys.getsizeof(pd_obj.index)
if is_series:
sample_frame_size = iloc[indices].memory_usage(deep=True, index=False)
else:
sample_frame_size = iloc[indices].memory_usage(deep=True, index=False).sum()
return index_size + sample_frame_size * len(pd_obj) // max_samples
else:
sample_size = sys.getsizeof(iloc[indices])
return sample_size * len(pd_obj) // max_samples


def build_fetch_chunk(
Expand Down