BUG: Fix groupby with as_index for categorical multi groupers pandas-…

…dev#13204 BUG: Fix string repr of Grouping
pijucha · Jul 2, 2016 · 374402c · 374402c
1 parent a01644c
commit 374402c
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 4 deletions.
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -527,3 +527,4 @@ Bug Fixes
 
 
 - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
+- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2250,7 +2250,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
                 self.grouper = to_timedelta(self.grouper)
 
     def __repr__(self):
-        return 'Grouping(%s)' % self.name
+        return 'Grouping({0})'.format(self.name)
 
     def __iter__(self):
         return iter(self.indices)
@@ -3741,9 +3741,39 @@ def _reindex_output(self, result):
             return result
 
         levels_list = [ping.group_index for ping in groupings]
-        index = MultiIndex.from_product(levels_list, names=self.grouper.names)
-        d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
-        return result.reindex(**d).sortlevel(axis=self.axis)
+        index, _ = MultiIndex.from_product(
+            levels_list, names=self.grouper.names).sortlevel()
+
+        if self.as_index:
+            d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
+            return result.reindex(**d)
+
+        # GH 13204
+        # Here, the categorical in-axis groupers, which need to be fully
+        # expanded, are columns in `result`. An idea is to do:
+        # result = result.set_index(self.grouper.names)
+        #                .reindex(index).reset_index()
+        # but special care has to be taken because of possible not-in-axis
+        # groupers.
+        # So, we manually select and drop the in-axis grouper columns,
+        # reindex `result`, and then reset the in-axis grouper columns.
+
+        # Select in-axis groupers
+        in_axis_grps = [(i, ping.name) for (i, ping)
+                        in enumerate(groupings) if ping.in_axis]
+        g_nums, g_names = zip(*in_axis_grps)
+
+        result = result.drop(labels=list(g_names), axis=1)
+
+        # Set a temp index and reindex (possibly expanding)
+        result = result.set_index(self.grouper.result_index
+                                  ).reindex(index, copy=False)
+
+        # Reset in-axis grouper columns
+        # (using level numbers `g_nums` because level names may not be unique)
+        result = result.reset_index(level=g_nums)
+
+        return result.reset_index(drop=True)
 
     def _iterate_column_groupbys(self):
         for i, colname in enumerate(self._selected_obj.columns):

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -6304,6 +6304,47 @@ def test_groupby_categorical_two_columns(self):
                                 nan, nan, nan, nan, 200, 34]}, index=idx)
         tm.assert_frame_equal(res, exp)
 
+    def test_groupby_multi_categorical_as_index(self):
+        # GH13204
+        df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
+                        'A': [10, 11, 11],
+                        'B': [101, 102, 103]})
+        result = df.groupby(['cat', 'A'], as_index=False).sum()
+        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
+                              'A': [10, 11, 10, 11, 10, 11],
+                              'B': [101.0, nan, nan, 205.0, nan, nan]},
+                             columns=['cat', 'A', 'B'])
+        tm.assert_frame_equal(result, expected)
+
+        # function grouper
+        f = lambda r: df.loc[r, 'A']
+        result = df.groupby(['cat', f], as_index=False).sum()
+        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
+                              'A': [10.0, nan, nan, 22.0, nan, nan],
+                              'B': [101.0, nan, nan, 205.0, nan, nan]},
+                             columns=['cat', 'A', 'B'])
+        tm.assert_frame_equal(result, expected)
+
+        # another not in-axis grouper (conflicting names in index)
+        s = Series(['a', 'b', 'b'], name='cat')
+        result = df.groupby(['cat', s], as_index=False).sum()
+        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
+                              'A': [10.0, nan, nan, 22.0, nan, nan],
+                              'B': [101.0, nan, nan, 205.0, nan, nan]},
+                             columns=['cat', 'A', 'B'])
+        tm.assert_frame_equal(result, expected)
+
+        # is original index dropped?
+        expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
+                              'A': [10, 11, 10, 11, 10, 11],
+                              'B': [101.0, nan, nan, 205.0, nan, nan]},
+                             columns=['cat', 'A', 'B'])
+
+        for name in [None, 'X', 'B', 'cat']:
+            df.index = Index(list("abc"), name=name)
+            result = df.groupby(['cat', 'A'], as_index=False).sum()
+            tm.assert_frame_equal(result, expected, check_index_type=True)
+
     def test_groupby_apply_all_none(self):
         # Tests to make sure no errors if apply function returns all None
         # values. Issue 9684.
@@ -6431,6 +6472,16 @@ def test_numpy_compat(self):
             tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
                                   getattr(g, func), foo=1)
 
+    def test_grouping_string_repr(self):
+        # GH 13394
+        mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
+        df = DataFrame([[1, 2, 3]], columns=mi)
+        gr = df.groupby(df[('A', 'a')])
+
+        result = gr.grouper.groupings[0].__repr__()
+        expected = "Grouping(('A', 'a'))"
+        tm.assert_equal(result, expected)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()