TEST-#3566: make Indexing benchmarks more representative (#3700)

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
modin-project · Nov 29, 2021 · 0582aa2
1 parent d8963a6
commit 0582aa2
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 41 deletions.
diff --git a/asv_bench/benchmarks/benchmarks.py b/asv_bench/benchmarks/benchmarks.py
@@ -36,6 +36,7 @@
     execute,
     translator_groupby_ngroups,
     get_benchmark_shapes,
+    trigger_import,
 )
 
 
@@ -497,29 +498,73 @@ class TimeIndexing:
     param_names = ["shape", "indexer_type"]
     params = [
         get_benchmark_shapes("TimeIndexing"),
-        ["scalar", "bool", "slice", "list", "function"],
+        [
+            "bool_array",
+            "bool_series",
+            "scalar",
+            "slice",
+            "continuous_slice",
+            "numpy_array_take_all_values",
+            "python_list_take_10_values",
+            "function",
+        ],
     ]
 
+    indexer_getters = {
+        "bool_array": lambda df: np.array([False, True] * (len(df) // 2)),
+        # This boolean Series is a projection of the source frame, so it
+        # shouldn't be re-imported or triggered to execute:
+        "bool_series": lambda df: df.iloc[:, 0] > 50,
+        "scalar": lambda df: len(df) // 2,
+        "slice": lambda df: slice(0, len(df), 2),
+        "continuous_slice": lambda df: slice(len(df) // 2),
+        "numpy_array_take_all_values": lambda df: np.arange(len(df)),
+        "python_list_take_10_values": lambda df: list(range(min(10, len(df)))),
+        "function": lambda df: (lambda df: df.index[::-2]),
+    }
+
     def setup(self, shape, indexer_type):
         self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
-        if indexer_type == "bool":
-            self.indexer = [False, True] * (shape[0] // 2)
-        elif indexer_type == "scalar":
-            self.indexer = shape[0] // 2
-        elif indexer_type == "slice":
-            self.indexer = slice(0, shape[0], 2)
-        elif indexer_type == "list":
-            self.indexer = [x for x in range(shape[0])]
-        elif indexer_type == "function":
-            self.indexer = lambda df: df.index[::-2]
+        trigger_import(self.df)
+
+        self.indexer = self.indexer_getters[indexer_type](self.df)
+        if isinstance(self.indexer, (pd.Series, pd.DataFrame)):
+            # HACK: Triggering `dtypes` metadata computation in advance,
+            # so it won't affect the measured `loc`/`iloc` time:
+            self.indexer.dtypes
 
     def time_iloc(self, shape, indexer_type):
-        execute(self.df.iloc[self.indexer])
+        # pandas doesn't implement `df.iloc[boolean_mask_series]` and raises an exception on it,
+        # so it is replaced with the semantically equivalent `df[boolean_mask_series]`:
+        if indexer_type != "bool_series":
+            execute(self.df.iloc[self.indexer])
+        else:
+            execute(self.df[self.indexer])
 
     def time_loc(self, shape, indexer_type):
         execute(self.df.loc[self.indexer])
 
 
+class TimeIndexingColumns:
+    param_names = ["shape"]
+    params = [get_benchmark_shapes("TimeIndexing")]
+
+    def setup(self, shape):
+        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
+        trigger_import(self.df)
+        self.numeric_indexer = [0, 1]
+        self.labels_indexer = self.df.columns[self.numeric_indexer].tolist()
+
+    def time_iloc(self, shape):
+        execute(self.df.iloc[:, self.numeric_indexer])
+
+    def time_loc(self, shape):
+        execute(self.df.loc[:, self.labels_indexer])
+
+    def time___getitem__(self, shape):
+        execute(self.df[self.labels_indexer])
+
+
 class TimeMultiIndexing:
     param_names = ["shape"]
     params = [get_benchmark_shapes("TimeMultiIndexing")]

diff --git a/asv_bench/benchmarks/omnisci/benchmarks.py b/asv_bench/benchmarks/omnisci/benchmarks.py
@@ -32,6 +32,11 @@
 import numpy as np
 import pandas
 
+from ..benchmarks import (
+    TimeIndexing as TimeIndexingPandasExecution,
+    TimeIndexingColumns as TimeIndexingColumnsPandasExecution,
+)
+
 
 class TimeJoin:
     param_names = ["shape", "how", "is_equal_keys"]
@@ -344,29 +349,18 @@ def time_value_counts(self, shape, ngroups):
         execute(self.series.value_counts())
 
 
-class TimeIndexing:
-    param_names = ["shape", "indexer_type"]
+class TimeIndexing(TimeIndexingPandasExecution):
     params = [
         get_benchmark_shapes("omnisci.TimeIndexing"),
-        ["scalar", "bool", "slice", "list", "function"],
+        *TimeIndexingPandasExecution.params[1:],
     ]
 
-    def setup(self, shape, indexer_type):
-        self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
-        trigger_import(self.df)
-        self.indexer = {
-            "bool": [False, True] * (shape[0] // 2),
-            "scalar": shape[0] // 2,
-            "slice": slice(0, shape[0], 2),
-            "list": list(range(shape[0])),
-            "function": lambda df: df.index[::-2],
-        }[indexer_type]
-
-    def time_iloc(self, shape, indexer_type):
-        execute(self.df.iloc[self.indexer])
-
-    def time_loc(self, shape, indexer_type):
-        execute(self.df.loc[self.indexer])
+
+class TimeIndexingColumns(TimeIndexingColumnsPandasExecution):
+    params = [
+        get_benchmark_shapes("omnisci.TimeIndexing"),
+        *TimeIndexingColumnsPandasExecution.params[1:],
+    ]
 
 
 class TimeResetIndex:

diff --git a/asv_bench/benchmarks/utils/common.py b/asv_bench/benchmarks/utils/common.py
@@ -462,22 +462,20 @@ def trigger_import(*dfs):
     *dfs : iterable
         DataFrames to trigger import.
     """
-    assert ASV_USE_STORAGE_FORMAT == "omnisci"
+    if ASV_USE_STORAGE_FORMAT != "omnisci" or ASV_USE_IMPL == "pandas":
+        return
 
     from modin.experimental.core.execution.native.implementations.omnisci_on_native.omnisci_worker import (
         OmnisciServer,
     )
 
     for df in dfs:
-        if ASV_USE_IMPL == "modin":
-            df.shape  # to trigger real execution
-            df._query_compiler._modin_frame._partitions[0][
-                0
-            ].frame_id = OmnisciServer().put_arrow_to_omnisci(
-                df._query_compiler._modin_frame._partitions[0][0].get()
-            )  # to trigger real execution
-        elif ASV_USE_IMPL == "pandas":
-            pass
+        df.shape  # to trigger real execution
+        df._query_compiler._modin_frame._partitions[0][
+            0
+        ].frame_id = OmnisciServer().put_arrow_to_omnisci(
+            df._query_compiler._modin_frame._partitions[0][0].get()
+        )  # to trigger real execution
 
 
 def execute(