modin-project · YarShev · May 3, 2024 · Mar 28, 2024 · Apr 4, 2024 · Apr 4, 2024
@@ -28,7 +28,7 @@
 from pandas.core.dtypes.common import is_dtype_equal, is_list_like, is_numeric_dtype
 from pandas.core.indexes.api import Index, RangeIndex
 
-from modin.config import Engine, IsRayCluster, MinPartitionSize, NPartitions
+from modin.config import CpuCount, Engine, IsRayCluster, MinPartitionSize, NPartitions
 from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe
 from modin.core.dataframe.base.dataframe.utils import Axis, JoinType, is_trivial_index
 from modin.core.dataframe.pandas.dataframe.utils import (
@@ -2202,12 +2202,47 @@
         PandasDataframe
             A new dataframe.
         """
-        map_fn = (
-            self._partition_mgr_cls.lazy_map_partitions
-            if lazy
-            else self._partition_mgr_cls.map_partitions
-        )
-        new_partitions = map_fn(self._partitions, func, func_args, func_kwargs)
+        if self.num_parts <= 1.5 * CpuCount.get():
+            # block-wise map
+            map_fn = (
+                self._partition_mgr_cls.lazy_map_partitions
+                if lazy
+                else self._partition_mgr_cls.map_partitions
+            )
+            new_partitions = map_fn(self._partitions, func, func_args, func_kwargs)
+        else:
+            # axis-wise map
+            # we choose an axis for a combination of partitions
+            # whose size is closer to the number of CPUs
+            if abs(self._partitions.shape[0] - CpuCount.get()) < abs(
+                self._partitions.shape[1] - CpuCount.get()
+            ):
+                axis = 1
+            else:
+                axis = 0
+
+            column_splits = CpuCount.get() // self._partitions.shape[1]
+
+            if axis == 0 and column_splits > 1:
+                # splitting by parts of columnar partitions
+                new_partitions = (
+                    self._partition_mgr_cls.map_partitions_joined_by_column(
+                        self._partitions, column_splits, func, func_args, func_kwargs
+                    )
+                )
+            else:
+                # splitting by full axis partitions
+                new_partitions = self._partition_mgr_cls.map_axis_partitions(
+                    axis,
+                    self._partitions,
+                    lambda df: func(
+                        df,
+                        *(func_args if func_args is not None else ()),
+                        **(func_kwargs if func_kwargs is not None else {}),
+                    ),
+                    keep_partitioning=True,
+                )
+
         if new_columns is not None and self.has_materialized_columns:
             assert len(new_columns) == len(
                 self.columns

@@ -745,6 +745,77 @@
             **kwargs,
         )
 
+    @classmethod
+    def map_partitions_joined_by_column(
+        cls,
+        partitions,
+        column_splits,
+        map_func,
+        map_func_args=None,
+        map_func_kwargs=None,
+    ):
+        """
+        Combine several blocks by column into one virtual partition and apply "map_func" to them.
+
+        Row partitions are handled in groups of ``step`` consecutive rows, where
+        ``step = max(partitions.shape[0] // column_splits, 1)``. For every group,
+        the blocks of each column are joined via ``cls.column_partitions`` and
+        "map_func" is applied with ``num_splits`` equal to the number of rows
+        actually present in that group.
+
+        Parameters
+        ----------
+        partitions : NumPy 2D array
+            Partitions of Modin Frame.
+        column_splits : int
+            The number of splits by column.
+        map_func : callable
+            Function to apply.
+        map_func_args : iterable, optional
+            Positional arguments for the 'map_func'.
+        map_func_kwargs : dict, optional
+            Keyword arguments for the 'map_func'.
+
+        Returns
+        -------
+        NumPy array
+            An array of new partitions for Modin Frame. It has the same shape
+            as `partitions`.
+
+        Raises
+        ------
+        ValueError
+            If `column_splits` is less than 1.
+        """
+        if column_splits < 1:
+            raise ValueError(
+                "The value of columns_splits must be greater than or equal to 1."
+            )
+        # step cannot be less than 1
+        step = max(partitions.shape[0] // column_splits, 1)
+        preprocessed_map_func = cls.preprocess_func(map_func)
+        args = () if map_func_args is None else map_func_args
+        kwargs = {} if map_func_kwargs is None else map_func_kwargs
+
+        # The output grid keeps the shape of the input grid, so an empty input
+        # simply yields an empty result.
+        result = np.empty(partitions.shape, dtype=object)
+        for start in range(0, partitions.shape[0], step):
+            partitions_subset = partitions[start : start + step]
+            # The last group is shorter than ``step`` when ``partitions.shape[0]``
+            # is not divisible by ``step``; asking for ``step`` splits there would
+            # change the number of row partitions.
+            actual_step = len(partitions_subset)
+            joined_columns = cls.column_partitions(partitions_subset)
+            for col, part in enumerate(joined_columns):
+                result[start : start + actual_step, col] = part.apply(
+                    preprocessed_map_func,
+                    *args,
+                    num_splits=actual_step,
+                    **kwargs,
+                )
+        return result
+
     @classmethod
     def concat(cls, axis, left_parts, right_parts):
         """

diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py
@@ -17,7 +17,7 @@
 import pytest
 
 import modin.pandas as pd
-from modin.config import MinPartitionSize, NPartitions, StorageFormat
+from modin.config import CpuCount, MinPartitionSize, NPartitions, StorageFormat
 from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype
 from modin.core.storage_formats.pandas.utils import split_result_of_axis_func_pandas
 from modin.pandas.testing import assert_index_equal, assert_series_equal
@@ -223,13 +223,51 @@ def test_add_suffix(data, axis):
     df_equals(new_modin_df.columns, new_pandas_df.columns)
 
 
+map_strtagies = ["map", "axis_map", "splitted_axis_map"]  # test_applymap params and ids
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize("testfunc", test_func_values, ids=test_func_keys)
 @pytest.mark.parametrize(
     "na_action", [None, "ignore"], ids=["no_na_action", "ignore_na"]
 )
-def test_applymap(data, testfunc, na_action):
+@pytest.mark.parametrize("map_strtagy", map_strtagies, ids=map_strtagies)
+def test_applymap(data, testfunc, na_action, map_strtagy):
+    keys = list(data.keys())
+    epected_shape = None
+    if map_strtagy == "map":
+        epected_shape = (1, 1)
+        max_size = max(len(keys), len(data[keys[0]]))
+        MinPartitionSize.put(max_size)
+    elif map_strtagy == "axis_map":
+        epected_shape = (CpuCount.get(), CpuCount.get())
+        min_size = min(len(keys), len(data[keys[0]]))
+        required_size = min_size // CpuCount.get()
+        if required_size > 0:
+            MinPartitionSize.put(required_size)
+            data = {k: v[:min_size] for k, v in data.items() if k in keys[:min_size]}
+        else:
+            pytest.skip(
+                "The stratagy cannot be tested with the currect data if required_size less than 1"
+            )
+    elif map_strtagy == "splitted_axis_map":
+        epected_shape = (2 * CpuCount.get(), 1)
+        required_size = len(data[keys[0]]) // (CpuCount.get() * 2)
+        # the stratagy cannot be tested with the currect data if required_size less than 1
+        if required_size > 0:
+            MinPartitionSize.put(required_size)
+            data = {k: v for k, v in data.items() if k in keys[:required_size]}
+        else:
+            pytest.skip(
+                "The stratagy cannot be tested with the currect data if required_size less than 1"
+            )
+    else:
+        raise ValueError("Incorrect map_strtagy")
+
     modin_df, pandas_df = create_test_dfs(data)
+    assert (
+        modin_df._query_compiler._modin_frame._partitions.shape == epected_shape
+    ), "Incorrect shape of partitions, please check data preparating."
 
     with pytest.raises(ValueError):
         x = 2