Skip to content

Commit

Permalink
TEST-#3566: make Indexing benchmarks more representative (#3700)
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Nov 29, 2021
1 parent d8963a6 commit 0582aa2
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 41 deletions.
69 changes: 57 additions & 12 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
execute,
translator_groupby_ngroups,
get_benchmark_shapes,
trigger_import,
)


Expand Down Expand Up @@ -497,29 +498,73 @@ class TimeIndexing:
param_names = ["shape", "indexer_type"]
params = [
get_benchmark_shapes("TimeIndexing"),
["scalar", "bool", "slice", "list", "function"],
[
"bool_array",
"bool_series",
"scalar",
"slice",
"continuous_slice",
"numpy_array_take_all_values",
"python_list_take_10_values",
"function",
],
]

# Maps every ``indexer_type`` benchmark parameter to a factory that builds the
# actual indexer from the source frame. Factories only rely on ``len(df)``,
# ``df.iloc`` and ``df.index``.
indexer_getters = {
    # Alternating False/True mask. It is derived from ``np.arange`` so that its
    # length always equals ``len(df)``: repeating ``[False, True]``
    # ``len(df) // 2`` times would come out one element short for an odd
    # number of rows and make boolean indexing fail on a length mismatch.
    "bool_array": lambda df: np.arange(len(df)) % 2 == 1,
    # This boolean Series is a projection of the source frame, so it shouldn't
    # be reimported or triggered to execute:
    "bool_series": lambda df: df.iloc[:, 0] > 50,
    # Single row located in the middle of the frame.
    "scalar": lambda df: len(df) // 2,
    # Every second row.
    "slice": lambda df: slice(0, len(df), 2),
    # Contiguous first half of the frame.
    "continuous_slice": lambda df: slice(len(df) // 2),
    # Every row position, passed as a NumPy array.
    "numpy_array_take_all_values": lambda df: np.arange(len(df)),
    # At most the first 10 row positions, passed as a plain Python list.
    "python_list_take_10_values": lambda df: list(range(min(10, len(df)))),
    # The indexer itself is a callable; the frame given to the factory is not
    # needed to build it, hence the ignored ``_df`` argument.
    "function": lambda _df: (lambda df: df.index[::-2]),
}

def setup(self, shape, indexer_type):
self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
if indexer_type == "bool":
self.indexer = [False, True] * (shape[0] // 2)
elif indexer_type == "scalar":
self.indexer = shape[0] // 2
elif indexer_type == "slice":
self.indexer = slice(0, shape[0], 2)
elif indexer_type == "list":
self.indexer = [x for x in range(shape[0])]
elif indexer_type == "function":
self.indexer = lambda df: df.index[::-2]
trigger_import(self.df)

self.indexer = self.indexer_getters[indexer_type](self.df)
if isinstance(self.indexer, (pd.Series, pd.DataFrame)):
# HACK: Triggering `dtypes` meta-data computation in advance,
# so it won't affect the `loc/iloc` time:
self.indexer.dtypes

def time_iloc(self, shape, indexer_type):
execute(self.df.iloc[self.indexer])
# Pandas doesn't implement `df.iloc[series boolean_mask]` and raises an exception on it.
# Replacing this with the semantically equivalent construction:
if indexer_type != "bool_series":
execute(self.df.iloc[self.indexer])
else:
execute(self.df[self.indexer])

def time_loc(self, shape, indexer_type):
    """Time ``df.loc[indexer]`` using the indexer stored on ``self`` by ``setup``."""
    selected = self.df.loc[self.indexer]
    execute(selected)


class TimeIndexingColumns:
    """Benchmarks that select the first two columns by position and by label."""

    param_names = ["shape"]
    params = [get_benchmark_shapes("TimeIndexing")]

    def setup(self, shape):
        """Create the source frame plus positional and label column indexers."""
        frame = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
        trigger_import(frame)
        positions = [0, 1]
        self.df = frame
        self.numeric_indexer = positions
        # Labels of the very same columns, so all three benchmarks below
        # select identical data.
        self.labels_indexer = frame.columns[positions].tolist()

    def time_iloc(self, shape):
        """Time positional column selection via ``iloc``."""
        picked = self.df.iloc[:, self.numeric_indexer]
        execute(picked)

    def time_loc(self, shape):
        """Time label-based column selection via ``loc``."""
        picked = self.df.loc[:, self.labels_indexer]
        execute(picked)

    def time___getitem__(self, shape):
        """Time label-based column selection via ``df[labels]``."""
        picked = self.df[self.labels_indexer]
        execute(picked)


class TimeMultiIndexing:
param_names = ["shape"]
params = [get_benchmark_shapes("TimeMultiIndexing")]
Expand Down
32 changes: 13 additions & 19 deletions asv_bench/benchmarks/omnisci/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
import numpy as np
import pandas

from ..benchmarks import (
TimeIndexing as TimeIndexingPandasExecution,
TimeIndexingColumns as TimeIndexingColumnsPandasExecution,
)


class TimeJoin:
param_names = ["shape", "how", "is_equal_keys"]
Expand Down Expand Up @@ -344,29 +349,18 @@ def time_value_counts(self, shape, ngroups):
execute(self.series.value_counts())


class TimeIndexing:
param_names = ["shape", "indexer_type"]
class TimeIndexing(TimeIndexingPandasExecution):
params = [
get_benchmark_shapes("omnisci.TimeIndexing"),
["scalar", "bool", "slice", "list", "function"],
*TimeIndexingPandasExecution.params[1:],
]

def setup(self, shape, indexer_type):
self.df = generate_dataframe(ASV_USE_IMPL, "int", *shape, RAND_LOW, RAND_HIGH)
trigger_import(self.df)
self.indexer = {
"bool": [False, True] * (shape[0] // 2),
"scalar": shape[0] // 2,
"slice": slice(0, shape[0], 2),
"list": list(range(shape[0])),
"function": lambda df: df.index[::-2],
}[indexer_type]

def time_iloc(self, shape, indexer_type):
execute(self.df.iloc[self.indexer])

def time_loc(self, shape, indexer_type):
execute(self.df.loc[self.indexer])

class TimeIndexingColumns(TimeIndexingColumnsPandasExecution):
    """Column-indexing benchmarks of the base class run on "omnisci.TimeIndexing" shapes."""

    # Only the shapes differ from the base class; any further parameter
    # lists it declares are reused unchanged.
    params = [get_benchmark_shapes("omnisci.TimeIndexing")] + list(
        TimeIndexingColumnsPandasExecution.params[1:]
    )


class TimeResetIndex:
Expand Down
18 changes: 8 additions & 10 deletions asv_bench/benchmarks/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,22 +462,20 @@ def trigger_import(*dfs):
*dfs : iterable
DataFrames to trigger import.
"""
assert ASV_USE_STORAGE_FORMAT == "omnisci"
if ASV_USE_STORAGE_FORMAT != "omnisci" or ASV_USE_IMPL == "pandas":
return

from modin.experimental.core.execution.native.implementations.omnisci_on_native.omnisci_worker import (
OmnisciServer,
)

for df in dfs:
if ASV_USE_IMPL == "modin":
df.shape # to trigger real execution
df._query_compiler._modin_frame._partitions[0][
0
].frame_id = OmnisciServer().put_arrow_to_omnisci(
df._query_compiler._modin_frame._partitions[0][0].get()
) # to trigger real execution
elif ASV_USE_IMPL == "pandas":
pass
df.shape # to trigger real execution
df._query_compiler._modin_frame._partitions[0][
0
].frame_id = OmnisciServer().put_arrow_to_omnisci(
df._query_compiler._modin_frame._partitions[0][0].get()
) # to trigger real execution


def execute(
Expand Down

0 comments on commit 0582aa2

Please sign in to comment.