BUG: group_shift_indexer checks for null group keys

closes #13813

Author: Ivan Nazarov <ivannnnz@gmail.com>

Closes #13819 from ivannz/issue13813fix and squashes the following commits:

bddf799 [Ivan Nazarov] Switched from float('nan') to np.nan
eab8038 [Ivan Nazarov] Added bugfix description [ci skip]
d92cf3c [Ivan Nazarov] minor flake8 style corrections
94bae0b [Ivan Nazarov] Patched the template, and added a test for '.shift()'
fe2f0ec [Ivan Nazarov] Treat incomplete group keys as distinct when shifting
966d5c6 [Ivan Nazarov] BUG: group_shift_indexer checks for null group keys
pandas-dev · Jul 29, 2016 · 54b2777 · 54b2777
1 parent 748787d
commit 54b2777
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 17 deletions.
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -266,37 +266,40 @@ New Index methods
 
 The following methods and options have been added to ``Index`` to be more consistent with ``Series`` and ``DataFrame``.
 
-- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
+``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
 
-  .. ipython:: python
+.. ipython:: python
 
-     idx = pd.Index(['a', 'b', 'c'])
-     idx.where([True, False, True])
+   idx = pd.Index(['a', 'b', 'c'])
+   idx.where([True, False, True])
 
 
-- ``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`)
+``Index`` now supports ``.dropna`` to exclude missing values (:issue:`6194`)
 
-  .. ipython:: python
+.. ipython:: python
 
-     idx = pd.Index([1, 2, np.nan, 4])
-     idx.dropna()
+   idx = pd.Index([1, 2, np.nan, 4])
+   idx.dropna()
 
 For ``MultiIndex``, values are dropped if any level is missing by default. Specifying
 ``how='all'`` only drops values where all levels are missing.
 
-     midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4],
+.. ipython:: python
+
+   midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4],
                                        [1, 2, np.nan, np.nan]])
-     midx
-     midx.dropna()
-     midx.dropna(how='all')
+   midx
+   midx.dropna()
+   midx.dropna(how='all')
 
-- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
-- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, the see :ref:`docs here <text.extractall>` (:issue:`10008`, :issue:`13156`)
+``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``; see the :ref:`docs here <text.extractall>` (:issue:`10008`, :issue:`13156`)
 
-  .. ipython:: python
+.. ipython:: python
+
+   idx = pd.Index(["a1a2", "b1", "c1"])
+   idx.str.extractall("[ab](?P<digit>\d)")
 
-     idx = pd.Index(["a1a2", "b1", "c1"])
-     idx.str.extractall("[ab](?P<digit>\d)")
+``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 
 .. _whatsnew_0190.enhancements.other:
 
@@ -736,6 +739,7 @@ Performance Improvements
 Bug Fixes
 ~~~~~~~~~
 
+- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
 - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
 - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
 - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)

diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi
@@ -1356,6 +1356,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
                 ## reverse iterator if shifting backwards
                 ii = offset + sign * i
                 lab = labels[ii]
+
+                # Skip null keys
+                if lab == -1:
+                    out[ii] = -1
+                    continue
+
                 label_seen[lab] += 1
 
                 idxer_slot = label_seen[lab] % periods

diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
@@ -700,6 +700,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
                 ## reverse iterator if shifting backwards
                 ii = offset + sign * i
                 lab = labels[ii]
+
+                # Skip null keys
+                if lab == -1:
+                    out[ii] = -1
+                    continue
+
                 label_seen[lab] += 1
 
                 idxer_slot = label_seen[lab] % periods

diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -6560,6 +6560,27 @@ def test_grouping_string_repr(self):
         expected = "Grouping(('A', 'a'))"
         tm.assert_equal(result, expected)
 
+    def test_group_shift_with_null_key(self):
+        # This test is designed to replicate the segfault in issue #13813.
+        n_rows = 1200
+
+        # Generate a moderately large dataframe with occasional missing
+        # values in column `B`, and then group by [`A`, `B`]. This should
+        # force `-1` in `labels` array of `g.grouper.group_info` exactly
+        # at those places where the group-by key is partially missing.
+        df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i)
+                        for i in range(n_rows)], dtype=float,
+                       columns=["A", "B", "Z"], index=None)
+        g = df.groupby(["A", "B"])
+
+        expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12
+                              else np.nan)
+                             for i in range(n_rows)], dtype=float,
+                             columns=["Z"], index=None)
+        result = g.shift(-1)
+
+        assert_frame_equal(result, expected)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()