Skip to content

Commit

Permalink
FIX-#3571: fallback to 'sort=True' on categorical keys in groupby (#3715
Browse files Browse the repository at this point in the history
)

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
  • Loading branch information
dchigarev committed Nov 29, 2021
1 parent 9b7ef2a commit c67a7c5
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 1 deletion.
16 changes: 16 additions & 0 deletions modin/core/dataframe/algebra/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .map_reduce import MapReduce
from .default2pandas.groupby import GroupBy
from modin.utils import try_cast_to_pandas, hashable
from modin.error_message import ErrorMessage


class GroupByReduce(MapReduce):
Expand Down Expand Up @@ -290,6 +291,21 @@ def caller(
)
assert axis == 0, "Can only groupby reduce with axis=0"

# The bug only occurs in the case of Categorical 'by', so we might want to check whether any of
# the 'by' dtypes is Categorical before going into this branch, however triggering 'dtypes'
# computation if they're not computed may take time, so we don't do it
if not groupby_args.get("sort", True) and isinstance(by, type(query_compiler)):
ErrorMessage.missmatch_with_pandas(
operation="df.groupby(categorical_by, sort=False)",
message=(
"the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
"See the following issue for more details: "
"https://github.com/modin-project/modin/issues/3571"
),
)
groupby_args = groupby_args.copy()
groupby_args["sort"] = True

if numeric_only:
qc = query_compiler.getitem_column_array(
query_compiler._modin_frame.numeric_columns(True)
Expand Down
2 changes: 1 addition & 1 deletion modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2894,7 +2894,7 @@ def value_counts(
):
if subset is None:
subset = self._query_compiler.columns
counted_values = self.groupby(by=subset, sort=False, dropna=dropna).size()
counted_values = self.groupby(by=subset, dropna=dropna, observed=True).size()
if sort:
counted_values.sort_values(ascending=ascending, inplace=True)
if normalize:
Expand Down
12 changes: 12 additions & 0 deletions modin/pandas/test/dataframe/test_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,15 @@ def comparator(md_res, pd_res):
# visit modin-issue#3388 for more info.
check_exception_type=None if sort and ascending is None else True,
)


def test_value_counts_categorical():
# from issue #3571
data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
random_state = np.random.RandomState(seed=42)
random_state.shuffle(data)

eval_general(
*create_test_dfs({"col1": data, "col2": data}, dtype="category"),
lambda df: df.value_counts(),
)
26 changes: 26 additions & 0 deletions modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1731,6 +1731,32 @@ def test_not_str_by(by, as_index):
eval_general(md_grp, pd_grp, lambda grp: grp.first())


@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("is_categorical_by", [True, False])
def test_groupby_sort(sort, is_categorical_by):
# from issue #3571
by = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
random_state = np.random.RandomState(seed=42)
random_state.shuffle(by)

data = {"key_col": by, "data_col": np.arange(len(by))}
md_df, pd_df = create_test_dfs(data)

if is_categorical_by:
md_df = md_df.astype({"key_col": "category"})
pd_df = pd_df.astype({"key_col": "category"})

md_grp = md_df.groupby("key_col", sort=sort)
pd_grp = pd_df.groupby("key_col", sort=sort)

modin_groupby_equals_pandas(md_grp, pd_grp)
eval_general(md_grp, pd_grp, lambda grp: grp.sum())
eval_general(md_grp, pd_grp, lambda grp: grp.size())
eval_general(md_grp, pd_grp, lambda grp: grp.agg(lambda df: df.mean()))
eval_general(md_grp, pd_grp, lambda grp: grp.dtypes)
eval_general(md_grp, pd_grp, lambda grp: grp.first())


def test_sum_with_level():
data = {
"A": ["0.0", "1.0", "2.0", "3.0", "4.0"],
Expand Down
12 changes: 12 additions & 0 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3449,6 +3449,18 @@ def sort_sensitive_comparator(df1, df2):
)


def test_value_counts_categorical():
# from issue #3571
data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
random_state = np.random.RandomState(seed=42)
random_state.shuffle(data)

eval_general(
*create_test_series(data, dtype="category"),
lambda df: df.value_counts(),
)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_values(data):
modin_series, pandas_series = create_test_series(data)
Expand Down

0 comments on commit c67a7c5

Please sign in to comment.