BUG: Spurious matches in DataFrame.duplicated when keep=False, #11864

pandas-dev · Jan 7, 2016 · b431f85
1 parent 6132df0
commit b431f85
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 1 deletion.
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -468,9 +468,11 @@ Bug Fixes
 - Bug in ``to_numeric`` where it does not raise if input is more than one dimension (:issue:`11776`)
 
 - Bug in parsing timezone offset strings with non-zero minutes (:issue:`11708`)
+
 - Bug in ``df.plot`` using incorrect colors for bar plots under matplotlib 1.5+ (:issue:`11614`)
 - Bug in the ``groupby`` ``plot`` method when using keyword arguments (:issue:`11805`).
 
+- Bug in ``DataFrame.duplicated`` and ``drop_duplicates`` causing spurious matches when setting ``keep=False`` (:issue:`11864`)
 
 - Bug in ``.loc`` result with duplicated key may have ``Index`` with incorrect dtype (:issue:`11497`)
 - Bug in ``pd.rolling_median`` where memory allocation failed even with sufficient memory (:issue:`11696`)

diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -1067,7 +1067,8 @@ def mode_int64(int64_t[:] values):
 @cython.boundscheck(False)
 def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
     cdef:
-        int ret = 0, value, k
+        int ret = 0, k
+        int64_t value
         Py_ssize_t i, n = len(values)
         kh_int64_t * table = kh_init_int64()
         ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -8532,6 +8532,13 @@ def test_drop_duplicates(self):
         df = pd.DataFrame([[-x, x], [x, x + 4]])
         assert_frame_equal(df.drop_duplicates(), df)
 
+        # GH 11864
+        df = pd.DataFrame([i] * 9 for i in range(16))
+        df = df.append([[1] + [0] * 8], ignore_index=True)
+
+        for keep in ['first', 'last', False]:
+            assert_equal(df.duplicated(keep=keep).sum(), 0)
+
     def test_drop_duplicates_for_take_all(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                                 'foo', 'bar', 'qux', 'foo'],