Skip to content

Commit

Permalink
BUG: Fix groupby with as_index for categorical multi groupers pandas-…
Browse files Browse the repository at this point in the history
…dev#13204

BUG: Fix string repr of Grouping
  • Loading branch information
pijucha committed Jul 2, 2016
1 parent a01644c commit 374402c
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -527,3 +527,4 @@ Bug Fixes


- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
- Bug in ``groupby`` with ``as_index=False`` returning all ``NaN`` values when grouping on multiple columns including a categorical one (:issue:`13204`)
38 changes: 34 additions & 4 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2250,7 +2250,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.grouper = to_timedelta(self.grouper)

def __repr__(self):
    """Return the string ``'Grouping(<name>)'`` for this grouping.

    ``str.format`` is used instead of ``%`` interpolation because
    ``self.name`` may be a tuple (for example a MultiIndex column key
    such as ``('A', 'a')``).  With ``'...%s' % name`` a tuple is treated
    as the full argument list, which raises ``TypeError`` when it has
    more than one element.
    """
    return 'Grouping({0})'.format(self.name)

def __iter__(self):
    """Return an iterator over ``self.indices``."""
    indices = self.indices
    return iter(indices)
Expand Down Expand Up @@ -3741,9 +3741,39 @@ def _reindex_output(self, result):
return result

levels_list = [ping.group_index for ping in groupings]
index = MultiIndex.from_product(levels_list, names=self.grouper.names)
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
return result.reindex(**d).sortlevel(axis=self.axis)
index, _ = MultiIndex.from_product(
levels_list, names=self.grouper.names).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
return result.reindex(**d)

# GH 13204
# Here, the categorical in-axis groupers, which need to be fully
# expanded, are columns in `result`. An idea is to do:
# result = result.set_index(self.grouper.names)
# .reindex(index).reset_index()
# but special care has to be taken because of possible not-in-axis
# groupers.
# So, we manually select and drop the in-axis grouper columns,
# reindex `result`, and then reset the in-axis grouper columns.

# Select in-axis groupers
in_axis_grps = [(i, ping.name) for (i, ping)
in enumerate(groupings) if ping.in_axis]
g_nums, g_names = zip(*in_axis_grps)

result = result.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
result = result.set_index(self.grouper.result_index
).reindex(index, copy=False)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
result = result.reset_index(level=g_nums)

return result.reset_index(drop=True)

def _iterate_column_groupbys(self):
for i, colname in enumerate(self._selected_obj.columns):
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -6304,6 +6304,47 @@ def test_groupby_categorical_two_columns(self):
nan, nan, nan, nan, 200, 34]}, index=idx)
tm.assert_frame_equal(res, exp)

def test_groupby_multi_categorical_as_index(self):
    """Group by several keys, one categorical, with ``as_index=False``.

    The categorical column declares category ``3``, which never occurs
    in the data; the result is expected to contain one row per
    combination of grouper values, with NaN where no data exists.
    See GH13204.
    """
    columns = ['cat', 'A', 'B']
    cat_expanded = [1, 1, 2, 2, 3, 3]
    b_sums = [101.0, nan, nan, 205.0, nan, nan]

    frame = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                       'A': [10, 11, 11],
                       'B': [101, 102, 103]})

    # Expected output when both groupers are columns of the frame:
    # 'A' is a grouper column, so it is expanded rather than summed.
    in_axis_expected = DataFrame({'cat': cat_expanded,
                                  'A': [10, 11, 10, 11, 10, 11],
                                  'B': b_sums},
                                 columns=columns)

    # Expected output when the second grouper is not a column of the
    # frame: 'A' is then an ordinary value column and gets summed.
    not_in_axis_expected = DataFrame({'cat': cat_expanded,
                                      'A': [10.0, nan, nan, 22.0, nan, nan],
                                      'B': b_sums},
                                     columns=columns)

    grouped = frame.groupby(['cat', 'A'], as_index=False)
    tm.assert_frame_equal(grouped.sum(), in_axis_expected)

    # function grouper
    grouped = frame.groupby(['cat', lambda r: frame.loc[r, 'A']],
                            as_index=False)
    tm.assert_frame_equal(grouped.sum(), not_in_axis_expected)

    # another not in-axis grouper (conflicting names in index)
    labels = Series(['a', 'b', 'b'], name='cat')
    grouped = frame.groupby(['cat', labels], as_index=False)
    tm.assert_frame_equal(grouped.sum(), not_in_axis_expected)

    # is original index dropped?
    for index_name in [None, 'X', 'B', 'cat']:
        frame.index = Index(list("abc"), name=index_name)
        summed = frame.groupby(['cat', 'A'], as_index=False).sum()
        tm.assert_frame_equal(summed, in_axis_expected,
                              check_index_type=True)

def test_groupby_apply_all_none(self):
# Tests to make sure no errors if apply function returns all None
# values. Issue 9684.
Expand Down Expand Up @@ -6431,6 +6472,16 @@ def test_numpy_compat(self):
tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
getattr(g, func), foo=1)

def test_grouping_string_repr(self):
    """Check the repr of a Grouping built from a MultiIndex column.

    The grouper is the column keyed by the tuple ``('A', 'a')``, and the
    expected repr embeds that tuple.  See GH 13394.
    """
    columns = MultiIndex.from_arrays([list("AAB"), list("aba")])
    frame = DataFrame([[1, 2, 3]], columns=columns)
    grouped = frame.groupby(frame[('A', 'a')])

    first_grouping = grouped.grouper.groupings[0]
    tm.assert_equal(repr(first_grouping), "Grouping(('A', 'a'))")


def assert_fp_equal(a, b, tol=1e-12):
    """Assert that ``a`` and ``b`` agree element-wise to within ``tol``.

    Parameters
    ----------
    a, b : objects supporting ``a - b``
        The difference is passed to ``np.abs`` and the comparison result
        must provide ``.all()`` (e.g. NumPy arrays or scalars).
    tol : float, default 1e-12
        Strict upper bound on the absolute difference of every element.

    Raises
    ------
    AssertionError
        If any absolute difference is not strictly below ``tol``.  A NaN
        difference never compares below ``tol`` and therefore fails too.
    """
    abs_diff = np.abs(a - b)
    assert (abs_diff < tol).all(), (
        'absolute difference is not below tolerance {0!r}'.format(tol))
Expand Down

0 comments on commit 374402c

Please sign in to comment.