FIX-#3571: fallback to 'sort=True' on categorical keys in groupby (#3715

) Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
modin-project · Nov 29, 2021 · c67a7c5 · c67a7c5
1 parent 9b7ef2a
commit c67a7c5
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 1 deletion.
diff --git a/modin/core/dataframe/algebra/groupby.py b/modin/core/dataframe/algebra/groupby.py
@@ -18,6 +18,7 @@
 from .map_reduce import MapReduce
 from .default2pandas.groupby import GroupBy
 from modin.utils import try_cast_to_pandas, hashable
+from modin.error_message import ErrorMessage
 
 
 class GroupByReduce(MapReduce):
@@ -290,6 +291,21 @@ def caller(
             )
         assert axis == 0, "Can only groupby reduce with axis=0"
 
+        # The bug only occurs in the case of Categorical 'by', so we might want to check whether any of
+        # the 'by' dtypes is Categorical before going into this branch, however triggering 'dtypes'
+        # computation if they're not computed may take time, so we don't do it
+        if not groupby_args.get("sort", True) and isinstance(by, type(query_compiler)):
+            ErrorMessage.missmatch_with_pandas(
+                operation="df.groupby(categorical_by, sort=False)",
+                message=(
+                    "the groupby keys will be sorted anyway, although the 'sort=False' was passed. "
+                    "See the following issue for more details: "
+                    "https://github.com/modin-project/modin/issues/3571"
+                ),
+            )
+            groupby_args = groupby_args.copy()
+            groupby_args["sort"] = True
+
         if numeric_only:
             qc = query_compiler.getitem_column_array(
                 query_compiler._modin_frame.numeric_columns(True)

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -2894,7 +2894,7 @@ def value_counts(
     ):
         if subset is None:
             subset = self._query_compiler.columns
-        counted_values = self.groupby(by=subset, sort=False, dropna=dropna).size()
+        counted_values = self.groupby(by=subset, dropna=dropna, observed=True).size()
         if sort:
             counted_values.sort_values(ascending=ascending, inplace=True)
         if normalize:

diff --git a/modin/pandas/test/dataframe/test_reduction.py b/modin/pandas/test/dataframe/test_reduction.py
@@ -411,3 +411,15 @@ def comparator(md_res, pd_res):
         # visit modin-issue#3388 for more info.
         check_exception_type=None if sort and ascending is None else True,
     )
+
+
+def test_value_counts_categorical():
+    # from issue #3571
+    data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
+    random_state = np.random.RandomState(seed=42)
+    random_state.shuffle(data)
+
+    eval_general(
+        *create_test_dfs({"col1": data, "col2": data}, dtype="category"),
+        lambda df: df.value_counts(),
+    )
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
@@ -1731,6 +1731,32 @@ def test_not_str_by(by, as_index):
     eval_general(md_grp, pd_grp, lambda grp: grp.first())
 
 
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("is_categorical_by", [True, False])
+def test_groupby_sort(sort, is_categorical_by):
+    # from issue #3571
+    by = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
+    random_state = np.random.RandomState(seed=42)
+    random_state.shuffle(by)
+
+    data = {"key_col": by, "data_col": np.arange(len(by))}
+    md_df, pd_df = create_test_dfs(data)
+
+    if is_categorical_by:
+        md_df = md_df.astype({"key_col": "category"})
+        pd_df = pd_df.astype({"key_col": "category"})
+
+    md_grp = md_df.groupby("key_col", sort=sort)
+    pd_grp = pd_df.groupby("key_col", sort=sort)
+
+    modin_groupby_equals_pandas(md_grp, pd_grp)
+    eval_general(md_grp, pd_grp, lambda grp: grp.sum())
+    eval_general(md_grp, pd_grp, lambda grp: grp.size())
+    eval_general(md_grp, pd_grp, lambda grp: grp.agg(lambda df: df.mean()))
+    eval_general(md_grp, pd_grp, lambda grp: grp.dtypes)
+    eval_general(md_grp, pd_grp, lambda grp: grp.first())
+
+
 def test_sum_with_level():
     data = {
         "A": ["0.0", "1.0", "2.0", "3.0", "4.0"],

diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -3449,6 +3449,18 @@ def sort_sensitive_comparator(df1, df2):
     )
 
 
+def test_value_counts_categorical():
+    # from issue #3571
+    data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
+    random_state = np.random.RandomState(seed=42)
+    random_state.shuffle(data)
+
+    eval_general(
+        *create_test_series(data, dtype="category"),
+        lambda df: df.value_counts(),
+    )
+
+
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_values(data):
     modin_series, pandas_series = create_test_series(data)