Skip to content

Commit

Permalink
FIX-#3498 Fix groupby to use kwargs for getting the groups (#3523)
Browse files Browse the repository at this point in the history
Co-authored-by: Devin Petersohn <devin.petersohn@gmail.com>
Co-authored-by: Vasily Litvinov <vasilij.n.litvinov@intel.com>
Co-authored-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
Signed-off-by: Andreas Haller <Andreas.Haller@medisite.de>
  • Loading branch information
4 people committed Dec 17, 2021
1 parent cc95ae2 commit 144a613
Show file tree
Hide file tree
Showing 3 changed files with 93 additions and 10 deletions.
16 changes: 13 additions & 3 deletions modin/pandas/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,9 @@ def _compute_index_grouped(self, numerical=False):
--------
pandas.core.groupby.GroupBy.groups
"""
# We end up using pure pandas to compute group indices, so raising a warning
ErrorMessage.default_to_pandas("Group indices computation")

# Splitting level-by and column-by since we serialize them in a different ways
by = None
level = []
Expand All @@ -866,6 +869,8 @@ def _compute_index_grouped(self, numerical=False):
by = self._by

is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
# `dropna` param is the only one that matters for the group indices result
dropna = self._kwargs.get("dropna", True)

if hasattr(self._by, "columns") and is_multi_by:
by = list(self._by.columns)
Expand All @@ -875,7 +880,6 @@ def _compute_index_grouped(self, numerical=False):
# end up using pandas implementation. Add the warning so the user is
# aware.
ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
ErrorMessage.default_to_pandas("Groupby with multiple columns")
if isinstance(by, list) and all(
is_label(self._df, o, self._axis) for o in by
):
Expand All @@ -886,7 +890,7 @@ def _compute_index_grouped(self, numerical=False):
by = try_cast_to_pandas(by, squeeze=True)
pandas_df = self._df._to_pandas()
by = wrap_into_list(by, level)
groupby_obj = pandas_df.groupby(by=by)
groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
return groupby_obj.indices if numerical else groupby_obj.groups
else:
if isinstance(self._by, type(self._query_compiler)):
Expand All @@ -908,7 +912,13 @@ def _compute_index_grouped(self, numerical=False):
# Since we want positional indices of the groups, we want to group
# on a `RangeIndex`, not on the actual index labels
axis_labels = pandas.RangeIndex(len(axis_labels))
return axis_labels.groupby(by)
# `pandas.Index.groupby` doesn't take any parameters except `by`.
# Have to convert an Index to a Series to be able to process `dropna=False`:
if dropna:
return axis_labels.groupby(by)
else:
groupby_obj = axis_labels.to_series().groupby(by, dropna=dropna)
return groupby_obj.indices if numerical else groupby_obj.groups

def _wrap_aggregation(
self, qc_method, default_func, drop=True, numeric_only=True, **kwargs
Expand Down
71 changes: 70 additions & 1 deletion modin/pandas/test/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
generate_multiindex,
test_groupby_data,
dict_equals,
value_equals,
)
from modin.config import NPartitions

Expand All @@ -45,7 +46,7 @@ def modin_groupby_equals_pandas(modin_groupby, pandas_groupby):
)

for g1, g2 in itertools.zip_longest(modin_groupby, pandas_groupby):
assert g1[0] == g2[0]
value_equals(g1[0], g2[0])
df_equals(g1[1], g2[1])


Expand Down Expand Up @@ -1350,6 +1351,74 @@ def test_groupby_multiindex(groupby_kwargs):
df_equals(md_grp.first(), pd_grp.first())


@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize(
"groupby_kwargs",
[
pytest.param({"level": 1, "axis": 1}, id="level_idx_axis=1"),
pytest.param({"level": 1}, id="level_idx"),
pytest.param({"level": [1, "four"]}, id="level_idx+name"),
pytest.param({"by": "four"}, id="level_name"),
pytest.param({"by": ["one", "two"]}, id="level_name_multi_by"),
pytest.param(
{"by": ["item0", "one", "two"]},
id="col_name+level_name",
),
pytest.param({"by": ["item0"]}, id="col_name"),
pytest.param(
{"by": ["item0", "item1"]},
id="col_name_multi_by",
),
],
)
def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna):
modin_df = pd.DataFrame(test_data["float_nan_data"])
pandas_df = pandas.DataFrame(test_data["float_nan_data"])

new_index = pandas.Index([f"item{i}" for i in range(len(pandas_df))])
new_columns = pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(modin_df.columns))],
names=["four", "two", "one"],
)
modin_df.columns = new_columns
modin_df.index = new_index
pandas_df.columns = new_columns
pandas_df.index = new_index

if groupby_kwargs.get("axis", 0) == 0:
modin_df = modin_df.T
pandas_df = pandas_df.T

md_grp, pd_grp = modin_df.groupby(
**groupby_kwargs, dropna=dropna
), pandas_df.groupby(**groupby_kwargs, dropna=dropna)
modin_groupby_equals_pandas(md_grp, pd_grp)

by_kwarg = groupby_kwargs.get("by", [])
# Disabled because of broken `dropna=False` for MapReduce implemented aggs:
# https://github.com/modin-project/modin/issues/3817
if not (
not dropna
and len(by_kwarg) > 1
and any(col in modin_df.columns for col in by_kwarg)
):
df_equals(md_grp.sum(), pd_grp.sum())
df_equals(md_grp.size(), pd_grp.size())
# Grouping on level works incorrect in case of aggregation:
# https://github.com/modin-project/modin/issues/2912
# "BaseOnPython" tests are disabled because of the bug:
# https://github.com/modin-project/modin/issues/3827
if get_current_execution() != "BaseOnPython" and any(
col in modin_df.columns for col in by_kwarg
):
df_equals(md_grp.quantile(), pd_grp.quantile())
# Default-to-pandas tests are disabled for multi-column 'by' because of the bug:
# https://github.com/modin-project/modin/issues/3827
if not (not dropna and len(by_kwarg) > 1):
df_equals(md_grp.first(), pd_grp.first())
df_equals(md_grp._default_to_pandas(lambda df: df.sum()), pd_grp.sum())


@pytest.mark.parametrize("groupby_axis", [0, 1])
@pytest.mark.parametrize("shift_axis", [0, 1])
def test_shift_freq(groupby_axis, shift_axis):
Expand Down
16 changes: 10 additions & 6 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1361,12 +1361,16 @@ def _make_default_file(filename=None, nrows=NROWS, ncols=2, force=True, **kwargs
return _make_default_file, filenames


def value_equals(obj1, obj2):
"""Check wherher two scalar or list-like values are equal and raise an ``AssertionError`` if they aren't."""
if is_list_like(obj1):
np.testing.assert_array_equal(obj1, obj2)
else:
assert (obj1 == obj2) or (np.isnan(obj1) and np.isnan(obj2))


def dict_equals(dict1, dict2):
"""Check whether two dictionaries are equal and raise an ``AssertionError`` if they aren't."""
for key1, key2 in itertools.zip_longest(sorted(dict1), sorted(dict2)):
assert (key1 == key2) or (np.isnan(key1) and np.isnan(key2))
value1, value2 = dict1[key1], dict2[key2]
if is_list_like(value1):
np.testing.assert_array_equal(value1, value2)
else:
assert value1 == value2
value_equals(key1, key2)
value_equals(dict1[key1], dict2[key2])

0 comments on commit 144a613

Please sign in to comment.