FIX-#3498 Fix groupby to use kwargs for getting the groups (#3523)

Co-authored-by: Devin Petersohn <devin.petersohn@gmail.com> Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com> Co-authored-by: Dmitry Chigarev <dmitry.chigarev@intel.com> Signed-off-by: Andreas Haller <Andreas.Haller@medisite.de>
modin-project · Dec 17, 2021 · 144a613 · 144a613
1 parent cc95ae2
commit 144a613
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 10 deletions.
diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py
@@ -846,6 +846,9 @@ def _compute_index_grouped(self, numerical=False):
         --------
         pandas.core.groupby.GroupBy.groups
         """
+        # We end up using pure pandas to compute group indices, so raising a warning
+        ErrorMessage.default_to_pandas("Group indices computation")
+
         # Splitting level-by and column-by since we serialize them in a different ways
         by = None
         level = []
@@ -866,6 +869,8 @@ def _compute_index_grouped(self, numerical=False):
             by = self._by
 
         is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
+        # `dropna` param is the only one that matters for the group indices result
+        dropna = self._kwargs.get("dropna", True)
 
         if hasattr(self._by, "columns") and is_multi_by:
             by = list(self._by.columns)
@@ -875,7 +880,6 @@ def _compute_index_grouped(self, numerical=False):
             # end up using pandas implementation. Add the warning so the user is
             # aware.
             ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
-            ErrorMessage.default_to_pandas("Groupby with multiple columns")
             if isinstance(by, list) and all(
                 is_label(self._df, o, self._axis) for o in by
             ):
@@ -886,7 +890,7 @@ def _compute_index_grouped(self, numerical=False):
                 by = try_cast_to_pandas(by, squeeze=True)
                 pandas_df = self._df._to_pandas()
             by = wrap_into_list(by, level)
-            groupby_obj = pandas_df.groupby(by=by)
+            groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
             return groupby_obj.indices if numerical else groupby_obj.groups
         else:
             if isinstance(self._by, type(self._query_compiler)):
@@ -908,7 +912,13 @@ def _compute_index_grouped(self, numerical=False):
                 # Since we want positional indices of the groups, we want to group
                 # on a `RangeIndex`, not on the actual index labels
                 axis_labels = pandas.RangeIndex(len(axis_labels))
-            return axis_labels.groupby(by)
+            # `pandas.Index.groupby` doesn't take any parameters except `by`.
+            # Have to convert an Index to a Series to be able to process `dropna=False`:
+            if dropna:
+                return axis_labels.groupby(by)
+            else:
+                groupby_obj = axis_labels.to_series().groupby(by, dropna=dropna)
+                return groupby_obj.indices if numerical else groupby_obj.groups
 
     def _wrap_aggregation(
         self, qc_method, default_func, drop=True, numeric_only=True, **kwargs

diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
@@ -30,6 +30,7 @@
     generate_multiindex,
     test_groupby_data,
     dict_equals,
+    value_equals,
 )
 from modin.config import NPartitions
 
@@ -45,7 +46,7 @@ def modin_groupby_equals_pandas(modin_groupby, pandas_groupby):
     )
 
     for g1, g2 in itertools.zip_longest(modin_groupby, pandas_groupby):
-        assert g1[0] == g2[0]
+        value_equals(g1[0], g2[0])
         df_equals(g1[1], g2[1])
 
 
@@ -1350,6 +1351,74 @@ def test_groupby_multiindex(groupby_kwargs):
     df_equals(md_grp.first(), pd_grp.first())
 
 
+@pytest.mark.parametrize("dropna", [True, False])
+@pytest.mark.parametrize(
+    "groupby_kwargs",
+    [
+        pytest.param({"level": 1, "axis": 1}, id="level_idx_axis=1"),
+        pytest.param({"level": 1}, id="level_idx"),
+        pytest.param({"level": [1, "four"]}, id="level_idx+name"),
+        pytest.param({"by": "four"}, id="level_name"),
+        pytest.param({"by": ["one", "two"]}, id="level_name_multi_by"),
+        pytest.param(
+            {"by": ["item0", "one", "two"]},
+            id="col_name+level_name",
+        ),
+        pytest.param({"by": ["item0"]}, id="col_name"),
+        pytest.param(
+            {"by": ["item0", "item1"]},
+            id="col_name_multi_by",
+        ),
+    ],
+)
+def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna):
+    modin_df = pd.DataFrame(test_data["float_nan_data"])
+    pandas_df = pandas.DataFrame(test_data["float_nan_data"])
+
+    new_index = pandas.Index([f"item{i}" for i in range(len(pandas_df))])
+    new_columns = pandas.MultiIndex.from_tuples(
+        [(i // 4, i // 2, i) for i in range(len(modin_df.columns))],
+        names=["four", "two", "one"],
+    )
+    modin_df.columns = new_columns
+    modin_df.index = new_index
+    pandas_df.columns = new_columns
+    pandas_df.index = new_index
+
+    if groupby_kwargs.get("axis", 0) == 0:
+        modin_df = modin_df.T
+        pandas_df = pandas_df.T
+
+    md_grp, pd_grp = modin_df.groupby(
+        **groupby_kwargs, dropna=dropna
+    ), pandas_df.groupby(**groupby_kwargs, dropna=dropna)
+    modin_groupby_equals_pandas(md_grp, pd_grp)
+
+    by_kwarg = groupby_kwargs.get("by", [])
+    # Disabled because of broken `dropna=False` for MapReduce implemented aggs:
+    # https://github.com/modin-project/modin/issues/3817
+    if not (
+        not dropna
+        and len(by_kwarg) > 1
+        and any(col in modin_df.columns for col in by_kwarg)
+    ):
+        df_equals(md_grp.sum(), pd_grp.sum())
+        df_equals(md_grp.size(), pd_grp.size())
+    # Grouping on level works incorrect in case of aggregation:
+    # https://github.com/modin-project/modin/issues/2912
+    # "BaseOnPython" tests are disabled because of the bug:
+    # https://github.com/modin-project/modin/issues/3827
+    if get_current_execution() != "BaseOnPython" and any(
+        col in modin_df.columns for col in by_kwarg
+    ):
+        df_equals(md_grp.quantile(), pd_grp.quantile())
+    # Default-to-pandas tests are disabled for multi-column 'by' because of the bug:
+    # https://github.com/modin-project/modin/issues/3827
+    if not (not dropna and len(by_kwarg) > 1):
+        df_equals(md_grp.first(), pd_grp.first())
+        df_equals(md_grp._default_to_pandas(lambda df: df.sum()), pd_grp.sum())
+
+
 @pytest.mark.parametrize("groupby_axis", [0, 1])
 @pytest.mark.parametrize("shift_axis", [0, 1])
 def test_shift_freq(groupby_axis, shift_axis):

diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
@@ -1361,12 +1361,16 @@ def _make_default_file(filename=None, nrows=NROWS, ncols=2, force=True, **kwargs
     return _make_default_file, filenames
 
 
+def value_equals(obj1, obj2):
+    """Check wherher two scalar or list-like values are equal and raise an ``AssertionError`` if they aren't."""
+    if is_list_like(obj1):
+        np.testing.assert_array_equal(obj1, obj2)
+    else:
+        assert (obj1 == obj2) or (np.isnan(obj1) and np.isnan(obj2))
+
+
 def dict_equals(dict1, dict2):
     """Check whether two dictionaries are equal and raise an ``AssertionError`` if they aren't."""
     for key1, key2 in itertools.zip_longest(sorted(dict1), sorted(dict2)):
-        assert (key1 == key2) or (np.isnan(key1) and np.isnan(key2))
-        value1, value2 = dict1[key1], dict2[key2]
-        if is_list_like(value1):
-            np.testing.assert_array_equal(value1, value2)
-        else:
-            assert value1 == value2
+        value_equals(key1, key2)
+        value_equals(dict1[key1], dict2[key2])