BUG: resample with nunique

closes #12352 Author: Jeff Reback <jeff@reback.net> Closes #12364 from jreback/resample_fix and squashes the following commits: d19c5fe [Jeff Reback] BUG: resample with nunique
pandas-dev · Feb 17, 2016 · f1aad46 · f1aad46
1 parent 5d1857c
commit f1aad46
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 3 deletions.
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -1035,7 +1035,7 @@ Bug Fixes
 - Bug in ``pd.concat`` while concatenating tz-aware NaT series. (:issue:`11693`, :issue:`11755`, :issue:`12217`)
 - Bug in ``pd.read_stata`` with version <= 108 files (:issue:`12232`)
 - Bug in ``Series.resample`` using a frequency of ``Nano`` when the index is a ``DatetimeIndex`` and contains non-zero nanosecond parts (:issue:`12037`)
-
+- Bug in resampling with ``.nunique`` and a sparse index (:issue:`12352`)
 
 - Bug in ``NaT`` subtraction from ``Timestamp`` or ``DatetimeIndex`` with timezones (:issue:`11718`)
 - Bug in subtraction of ``Series`` of a single tz-aware ``Timestamp`` (:issue:`12290`)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2819,8 +2819,16 @@ def nunique(self, dropna=True):
             inc[idx] = 1
 
         out = np.add.reduceat(inc, idx).astype('int64', copy=False)
-        return Series(out if ids[0] != -1 else out[1:],
-                      index=self.grouper.result_index,
+        res = out if ids[0] != -1 else out[1:]
+        ri = self.grouper.result_index
+
+        # we might have duplications among the bins
+        if len(res) != len(ri):
+            res, out = np.zeros(len(ri), dtype=out.dtype), res
+            res[ids] = out
+
+        return Series(res,
+                      index=ri,
                       name=self.name)
 
     @deprecate_kwarg('take_last', 'keep',

diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
@@ -1526,6 +1526,34 @@ def test_resample_timegrouper(self):
             result = df.groupby(pd.Grouper(freq='M', key='A')).count()
             assert_frame_equal(result, expected)
 
+    def test_resample_nunique(self):
+        """Check ``nunique`` after daily resampling/grouping (GH 12352).
+
+        The frame holds two rows, dated 2015-06-05 and 2015-06-08, so
+        binning by day leaves bins between them with no rows.  The
+        reference result is computed by applying ``Series.nunique`` per
+        group, and every ``nunique`` entry point below is compared to it.
+        """
+
+        # GH 12352
+        df = DataFrame({
+            'ID': {pd.Timestamp('2015-06-05 00:00:00'): '0010100903',
+                   pd.Timestamp('2015-06-08 00:00:00'): '0010150847'},
+            'DATE': {pd.Timestamp('2015-06-05 00:00:00'): '2015-06-05',
+                     pd.Timestamp('2015-06-08 00:00:00'): '2015-06-08'}})
+        r = df.resample('D')
+        g = df.groupby(pd.Grouper(freq='D'))
+        expected = df.groupby(pd.TimeGrouper('D')).ID.apply(lambda x:
+                                                            x.nunique())
+        self.assertEqual(expected.name, 'ID')
+
+        # Exercise both the resample object and the groupby object; use
+        # the loop variable so that ``g`` is actually tested as well.
+        for t in [r, g]:
+            result = t.ID.nunique()
+            assert_series_equal(result, expected)
+
+        # TODO
+        # this should have name
+        # https://github.com/pydata/pandas/issues/12363
+        # Until that issue is resolved, the Series-level entry points are
+        # compared against an unnamed expected result.
+        expected.name = None
+        result = df.ID.resample('D').nunique()
+        assert_series_equal(result, expected)
+
+        result = df.ID.groupby(pd.Grouper(freq='D')).nunique()
+        assert_series_equal(result, expected)
+
     def test_resample_group_info(self):  # GH10914
         for n, k in product((10000, 100000), (10, 100, 1000)):
             dr = date_range(start='2015-08-27', periods=n // 10, freq='T')