Skip to content

Commit

Permalink
TEST-#3750: Create a variable for minimal partition size. (#3751)
Browse files Browse the repository at this point in the history
Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
Co-authored-by: Alexey Prutskov <alexey.prutskov@intel.com>
Signed-off-by: mvashishtha <mahesh@ponder.io>
  • Loading branch information
3 people committed Dec 7, 2021
1 parent d590de0 commit 1be66d1
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 7 deletions.
12 changes: 12 additions & 0 deletions modin/config/envvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,18 @@ def get(self):
return result


class MinPartitionSize(EnvironmentVariable, type=int):
    """
    Minimum number of rows/columns in a single pandas partition split.

    Once a partition for a pandas dataframe has more than this many elements,
    Modin adds another partition.
    """

    # Name of the environment variable the config machinery reads this from.
    varname = "MODIN_MIN_PARTITION_SIZE"
    # Default of 32 matches the block size previously hard-coded across
    # Modin's partitioning utils and tests (see compute_chunksize).
    default = 32


def _check_vars():
"""
Check validity of environment variables.
Expand Down
14 changes: 12 additions & 2 deletions modin/core/storage_formats/pandas/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

"""Contains utility functions for frame partitioning."""

from modin.config import MinPartitionSize
import numpy as np
import pandas

Expand All @@ -38,7 +39,12 @@ def get_default_chunksize(length, num_splits):
)


def compute_chunksize(df, num_splits, default_block_size=32, axis=None):
def compute_chunksize(
df,
num_splits,
default_block_size=None,
axis=None,
):
"""
Compute the number of rows and/or columns to include in each partition.
Expand All @@ -48,8 +54,9 @@ def compute_chunksize(df, num_splits, default_block_size=32, axis=None):
DataFrame to split.
num_splits : int
Number of splits to separate the DataFrame into.
default_block_size : int, default: 32
default_block_size : int, optional
Minimum number of rows/columns in a single split.
If not specified, the value is assumed equal to ``MinPartitionSize``.
axis : int, optional
Axis to split across. If not specified - split accros both axes.
Expand All @@ -59,6 +66,9 @@ def compute_chunksize(df, num_splits, default_block_size=32, axis=None):
If axis is 1 or 0, returns an integer number of rows/columns to split the
DataFrame. If axis is None, returns a tuple containing both.
"""
default_block_size = (
MinPartitionSize.get() if default_block_size is None else default_block_size
)
if axis == 0 or axis is None:
row_chunksize = get_default_chunksize(len(df.index), num_splits)
# Take the min of the default and the memory-usage chunksize first to avoid a
Expand Down
3 changes: 2 additions & 1 deletion modin/pandas/test/dataframe/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import numpy as np
import pandas
import matplotlib
from modin.config import MinPartitionSize
import modin.pandas as pd

from pandas.core.dtypes.common import is_list_like
Expand Down Expand Up @@ -181,7 +182,7 @@ def test_explode_all_partitions(column, ignore_index):
# expand every row in the input data into two rows. It's especially
# important that the input data has list-like elements that must be
# expanded at the boundaries of the partitions, e.g. at row 31.
num_rows = NPartitions.get() * 32
num_rows = NPartitions.get() * MinPartitionSize.get()
data = {"A": [[3, 4]] * num_rows, "C": [["a", "b"]] * num_rows}
eval_general(
*create_test_dfs(data),
Expand Down
8 changes: 4 additions & 4 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
assert_extension_array_equal,
)
from pandas.core.dtypes.common import is_list_like
from modin.config.envvars import NPartitions
from modin.config import MinPartitionSize, NPartitions
import modin.pandas as pd
from modin.utils import to_pandas, try_cast_to_pandas
from modin.config import TestDatasetSize, TrackFileLeaks
Expand Down Expand Up @@ -211,11 +211,11 @@

# Fully fill all of the partitions used in tests.
test_data_large_categorical_dataframe = {
i: pandas.Categorical(np.arange(NPartitions.get() * 32))
for i in range(NPartitions.get() * 32)
i: pandas.Categorical(np.arange(NPartitions.get() * MinPartitionSize.get()))
for i in range(NPartitions.get() * MinPartitionSize.get())
}
test_data_large_categorical_series_values = [
pandas.Categorical(np.arange(NPartitions.get() * 32))
pandas.Categorical(np.arange(NPartitions.get() * MinPartitionSize.get()))
]
test_data_large_categorical_series_keys = ["categorical_series"]

Expand Down

0 comments on commit 1be66d1

Please sign in to comment.