BENCH: add some cases for join and merge ops from pandas (#5021)
Signed-off-by: Myachev <anatoly.myachev@intel.com>
jbrockmendel committed Oct 10, 2022
1 parent d005429 commit abcf1e9
Showing 4 changed files with 74 additions and 1 deletion.
63 changes: 62 additions & 1 deletion asv_bench/benchmarks/benchmarks.py
@@ -20,6 +20,7 @@
# measurements

import numpy as np
import pandas._testing as tm

from .utils import (
generate_dataframe,
@@ -127,12 +128,56 @@ def time_join(self, shapes, how, sort):
execute(self.df1.join(self.df2, how=how, lsuffix="left_", sort=sort))


class TimeJoinStringIndex:
param_names = ["shapes", "sort"]
params = [
get_benchmark_shapes("TimeJoinStringIndex"),
[True, False],
]

def setup(self, shapes, sort):
assert shapes[0] % 100 == 0, "implementation restriction"
level1 = tm.makeStringIndex(10).values
level2 = tm.makeStringIndex(shapes[0] // 100).values
codes1 = np.arange(10).repeat(shapes[0] // 100)
codes2 = np.tile(np.arange(shapes[0] // 100), 10)
index2 = IMPL.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
self.df_multi = IMPL.DataFrame(
np.random.randn(len(index2), 4), index=index2, columns=["A", "B", "C", "D"]
)

self.key1 = np.tile(level1.take(codes1), 10)
self.key2 = np.tile(level2.take(codes2), 10)
self.df = generate_dataframe("int", *shapes, RAND_LOW, RAND_HIGH)
        # drop two columns so the frame keeps its original shape after the key columns are added
self.df = self.df.drop(columns=self.df.columns[-2:])
self.df["key1"] = self.key1
self.df["key2"] = self.key2
execute(self.df)

self.df_key1 = IMPL.DataFrame(
np.random.randn(len(level1), 4), index=level1, columns=["A", "B", "C", "D"]
)
self.df_key2 = IMPL.DataFrame(
np.random.randn(len(level2), 4), index=level2, columns=["A", "B", "C", "D"]
)

def time_join_dataframe_index_multi(self, shapes, sort):
execute(self.df.join(self.df_multi, on=["key1", "key2"], sort=sort))

def time_join_dataframe_index_single_key_bigger(self, shapes, sort):
execute(self.df.join(self.df_key2, on="key2", sort=sort))

def time_join_dataframe_index_single_key_small(self, shapes, sort):
execute(self.df.join(self.df_key1, on="key1", sort=sort))


class TimeMerge:
param_names = ["shapes", "how", "sort"]
params = [
get_benchmark_shapes("TimeMerge"),
["left", "inner"],
[False],
[True, False],
]

def setup(self, shapes, how, sort):
@@ -147,6 +192,19 @@ def time_merge(self, shapes, how, sort):
)
)

def time_merge_default(self, shapes, how, sort):
execute(IMPL.merge(self.df1, self.df2, how=how, sort=sort))

def time_merge_dataframe_empty_right(self, shapes, how, sort):
# Getting an empty dataframe using `iloc` should be very fast,
# so the impact on the time of the merge operation should be negligible.
execute(IMPL.merge(self.df1, self.df2.iloc[:0], how=how, sort=sort))

def time_merge_dataframe_empty_left(self, shapes, how, sort):
# Getting an empty dataframe using `iloc` should be very fast,
# so the impact on the time of the merge operation should be negligible.
execute(IMPL.merge(self.df1.iloc[:0], self.df2, how=how, sort=sort))


class TimeMergeCategoricals:
param_names = ["shapes", "data_type"]
@@ -759,3 +817,6 @@ def time_columns(self, shape):

def time_index(self, shape):
return self.df.index


from .utils import setup # noqa: E402, F401
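
For reference, a minimal standalone sketch of the join-on-MultiIndex pattern that TimeJoinStringIndex times, written against plain pandas with small hypothetical sizes and column names; the benchmark itself builds its frames through IMPL, generate_dataframe and tm.makeStringIndex instead.

import numpy as np
import pandas as pd

level1 = pd.Index([f"a{i}" for i in range(10)])
level2 = pd.Index([f"b{i}" for i in range(50)])
codes1 = np.arange(10).repeat(50)
codes2 = np.tile(np.arange(50), 10)
index2 = pd.MultiIndex(levels=[level1, level2], codes=[codes1, codes2])
df_multi = pd.DataFrame(
    np.random.randn(len(index2), 4), index=index2, columns=list("ABCD")
)

# Flat frame whose key columns are looked up in df_multi's MultiIndex.
df = pd.DataFrame({"value": np.random.randn(len(index2))})
df["key1"] = level1.take(codes1)
df["key2"] = level2.take(codes2)

joined = df.join(df_multi, on=["key1", "key2"], sort=True)
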
2 changes: 2 additions & 0 deletions asv_bench/benchmarks/utils/__init__.py
@@ -32,6 +32,7 @@
random_booleans,
translator_groupby_ngroups,
trigger_import,
setup,
)

__all__ = [
@@ -54,4 +55,5 @@
"random_booleans",
"translator_groupby_ngroups",
"trigger_import",
"setup",
]
7 changes: 7 additions & 0 deletions asv_bench/benchmarks/utils/common.py
@@ -594,3 +594,10 @@ def prepare_io_data_parquet(test_filename: str, data_type: str, shapes: list):
df.to_parquet(test_filenames[shape_id], index=False)

return test_filenames


def setup(*args, **kwargs): # noqa: GL08
# This function just needs to be imported into each benchmark file to
    # set up the random seed before each benchmark; ASV runs it automatically.
# https://asv.readthedocs.io/en/latest/writing_benchmarks.html
np.random.seed(42)
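
A rough sketch of how a benchmark module picks up this hook: re-exporting setup (exactly as benchmarks.py does above) is enough for ASV to call it automatically, so the seed is fixed without any per-class boilerplate. TimeExample below is hypothetical and not part of this commit.

import numpy as np

from .utils import setup  # noqa: E402, F401


class TimeExample:
    def setup(self):
        # ASV also calls the module-level ``setup`` imported above for each
        # run, fixing np.random.seed(42) so the data is reproducible
        # (see the comment in common.py).
        self.data = np.random.rand(1_000)

    def time_sum(self):
        self.data.sum()
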
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/utils/data_shapes.py
@@ -169,6 +169,9 @@
DEFAULT_CONFIG["MergeCategoricals"] = (
[[10_000, 2]] if ASV_DATASET_SIZE == "big" else [[1_000, 2]]
)
DEFAULT_CONFIG["TimeJoinStringIndex"] = (
[[100_000, 64]] if ASV_DATASET_SIZE == "big" else [[1_000, 4]]
)
for config in (_DEFAULT_CONFIG_T, _DEFAULT_HDK_CONFIG_T):
for _shape, _names in config:
DEFAULT_CONFIG.update({_name: _shape for _name in _names})
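
ASV builds one benchmark variant for every (shapes, sort) combination in the cross product of params, so the new TimeJoinStringIndex entry above feeds the first axis of that product. A rough sketch of the expansion, assuming get_benchmark_shapes simply returns the configured list for the non-"big" ASV_DATASET_SIZE:

from itertools import product

shapes = [[1_000, 4]]        # DEFAULT_CONFIG["TimeJoinStringIndex"], small size
sort_values = [True, False]  # second params axis of TimeJoinStringIndex

# setup() and each time_* method run once per combination.
for shape, sort in product(shapes, sort_values):
    print(shape, sort)       # [1000, 4] True, then [1000, 4] False
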