Merge pull request #10236 from sinhrks/duplicated
ENH: duplicated and drop_duplicates now accept keep kw
sinhrks committed Aug 8, 2015
2 parents 0259ace + 1b913ba commit 529288e
Showing 14 changed files with 448 additions and 90 deletions.
10 changes: 6 additions & 4 deletions doc/source/indexing.rst
@@ -1178,17 +1178,19 @@ takes as an argument the columns to use to identify duplicated rows.
- ``drop_duplicates`` removes duplicate rows.

By default, the first observed row of a duplicate set is considered unique, but
-each method has a ``take_last`` parameter that indicates the last observed row
-should be taken instead.
+each method has a ``keep`` parameter to specify targets to be kept.

.. ipython:: python
df2 = pd.DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
'c' : np.random.randn(7)})
df2.duplicated(['a','b'])
+df2.duplicated(['a','b'], keep='last')
+df2.duplicated(['a','b'], keep=False)
df2.drop_duplicates(['a','b'])
-df2.drop_duplicates(['a','b'], take_last=True)
+df2.drop_duplicates(['a','b'], keep='last')
+df2.drop_duplicates(['a','b'], keep=False)
An alternative way to drop duplicates on the index is ``.groupby(level=0)`` combined with ``first()`` or ``last()``.

@@ -1199,7 +1201,7 @@ An alternative way to drop duplicates on the index is ``.groupby(level=0)`` comb
df3.groupby(level=0).first()
# a bit more verbose
-df3.reset_index().drop_duplicates(subset='b', take_last=False).set_index('b')
+df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')
.. _indexing.dictionarylike:

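To make the indexing.rst hunk concrete: both idioms keep one row per duplicated index label. A minimal sketch, using a hypothetical stand-in for the doc's `df3` (the real frame is defined earlier on the page and not shown in this hunk):

```python
import pandas as pd

# Hypothetical stand-in for the doc's df3: duplicate labels in the index.
df3 = pd.DataFrame({'a': [0, 1, 2, 3]},
                   index=pd.Index(['x', 'y', 'x', 'y'], name='b'))

# Keep the first row per index label via groupby ...
df3.groupby(level=0).first()

# ... or, more verbosely, by round-tripping the index through a column.
df3.reset_index().drop_duplicates(subset='b', keep='first').set_index('b')
```

Both return a frame indexed by ['x', 'y'] with a = [0, 1].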
10 changes: 10 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
@@ -142,6 +142,15 @@ Other enhancements
- ``pd.merge`` will now allow duplicate column names if they are not merged upon (:issue:`10639`).

- ``pd.pivot`` will now allow passing index as ``None`` (:issue:`3962`).
+- ``drop_duplicates`` and ``duplicated`` now accept ``keep`` keyword to target first, last, and all duplicates. ``take_last`` keyword is deprecated, see :ref:`deprecations <whatsnew_0170.deprecations>` (:issue:`6511`, :issue:`8505`)
+
+.. ipython:: python
+
+s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])
+s.drop_duplicates()
+s.drop_duplicates(keep='last')
+s.drop_duplicates(keep=False)


.. _whatsnew_0170.api:

@@ -520,6 +529,7 @@ Deprecations
===================== =================================

- ``Categorical.name`` was deprecated to make ``Categorical`` more ``numpy.ndarray`` like. Use ``Series(cat, name="whatever")`` instead (:issue:`10482`).
+- ``drop_duplicates`` and ``duplicated``'s ``take_last`` keyword was removed in favor of ``keep``. (:issue:`6511`, :issue:`8505`)

.. _whatsnew_0170.prior_deprecations:

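The three `keep` values added above, spelled out on the whatsnew example (expected results reasoned from the definitions, shown as comments):

```python
import pandas as pd

s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D'])

s.drop_duplicates()              # first occurrences survive: A, B, C, D (positions 0, 1, 2, 5)
s.drop_duplicates(keep='last')   # last occurrences survive:  C, A, B, D (positions 2, 3, 4, 5)
s.drop_duplicates(keep=False)    # all duplicated values go:  C, D       (positions 2, 5)
```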
27 changes: 18 additions & 9 deletions pandas/core/base.py
@@ -6,7 +6,7 @@
from pandas.core import common as com
import pandas.core.nanops as nanops
import pandas.lib as lib
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg
from pandas.core.strings import StringMethods
from pandas.core.common import AbstractMethodError

@@ -543,18 +543,23 @@ def _dir_deletions(self):
Parameters
----------
-take_last : boolean, default False
-Take the last observed index in a group. Default first
+keep : {'first', 'last', False}, default 'first'
+- ``first`` : Drop duplicates except for the first occurrence.
+- ``last`` : Drop duplicates except for the last occurrence.
+- False : Drop all duplicates.
+take_last : deprecated
%(inplace)s
Returns
-------
deduplicated : %(klass)s
""")

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs)
-def drop_duplicates(self, take_last=False, inplace=False):
-duplicated = self.duplicated(take_last=take_last)
+def drop_duplicates(self, keep='first', inplace=False):
+duplicated = self.duplicated(keep=keep)
result = self[np.logical_not(duplicated)]
if inplace:
return self._update_inplace(result)
@@ -566,18 +571,22 @@ def drop_duplicates(self, take_last=False, inplace=False):
Parameters
----------
-take_last : boolean, default False
-Take the last observed index in a group. Default first
+keep : {'first', 'last', False}, default 'first'
+- ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+- ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+- False : Mark all duplicates as ``True``.
+take_last : deprecated
Returns
-------
duplicated : %(duplicated)s
""")

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs)
-def duplicated(self, take_last=False):
+def duplicated(self, keep='first'):
keys = com._ensure_object(self.values)
-duplicated = lib.duplicated(keys, take_last=take_last)
+duplicated = lib.duplicated(keys, keep=keep)
try:
return self._constructor(duplicated,
index=self.index).__finalize__(self)
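The `deprecate_kwarg('take_last', 'keep', mapping=...)` decorator applied throughout this diff reroutes the old boolean onto the new string value while warning the caller. A minimal illustrative sketch of that pattern (not pandas' actual implementation):

```python
import warnings
from functools import wraps

def deprecate_kwarg(old_arg_name, new_arg_name, mapping=None):
    """Sketch of a keyword shim: forward a deprecated kwarg to its successor."""
    def decorate(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            if old_arg_name in kwargs:
                old_val = kwargs.pop(old_arg_name)
                # Translate e.g. take_last=True into keep='last'.
                new_val = mapping.get(old_val, old_val) if mapping else old_val
                warnings.warn('%s=%r is deprecated, use %s=%r instead'
                              % (old_arg_name, old_val, new_arg_name, new_val),
                              FutureWarning, stacklevel=2)
                kwargs[new_arg_name] = new_val
            return func(*args, **kwargs)
        return wrapper
    return decorate

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
def duplicated(values, keep='first'):
    return keep  # stand-in body for illustration

duplicated([1, 2], take_last=True)  # warns, then runs with keep='last'
```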
27 changes: 18 additions & 9 deletions pandas/core/frame.py
@@ -2866,8 +2866,9 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None,
else:
return result

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-def drop_duplicates(self, subset=None, take_last=False, inplace=False):
+def drop_duplicates(self, subset=None, keep='first', inplace=False):
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain columns
@@ -2877,8 +2878,11 @@ def drop_duplicates(self, subset=None, take_last=False, inplace=False):
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
-take_last : boolean, default False
-Take the last observed row in a row. Defaults to the first row
+keep : {'first', 'last', False}, default 'first'
+- ``first`` : Drop duplicates except for the first occurrence.
+- ``last`` : Drop duplicates except for the last occurrence.
+- False : Drop all duplicates.
+take_last : deprecated
inplace : boolean, default False
Whether to drop duplicates in place or to return a copy
cols : kwargs only argument of subset [deprecated]
@@ -2887,7 +2891,7 @@
-------
deduplicated : DataFrame
"""
-duplicated = self.duplicated(subset, take_last=take_last)
+duplicated = self.duplicated(subset, keep=keep)

if inplace:
inds, = (-duplicated).nonzero()
@@ -2896,8 +2900,9 @@
else:
return self[-duplicated]

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@deprecate_kwarg(old_arg_name='cols', new_arg_name='subset')
-def duplicated(self, subset=None, take_last=False):
+def duplicated(self, subset=None, keep='first'):
"""
Return boolean Series denoting duplicate rows, optionally only
considering certain columns
@@ -2907,9 +2912,13 @@ def duplicated(self, subset=None, take_last=False):
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns
-take_last : boolean, default False
-For a set of distinct duplicate rows, flag all but the last row as
-duplicated. Default is for all but the first row to be flagged
+keep : {'first', 'last', False}, default 'first'
+- ``first`` : Mark duplicates as ``True`` except for the
+first occurrence.
+- ``last`` : Mark duplicates as ``True`` except for the
+last occurrence.
+- False : Mark all duplicates as ``True``.
+take_last : deprecated
cols : kwargs only argument of subset [deprecated]
Returns
@@ -2935,7 +2944,7 @@ def f(vals):
labels, shape = map(list, zip( * map(f, vals)))

ids = get_group_index(labels, shape, sort=False, xnull=False)
-return Series(duplicated_int64(ids, take_last), index=self.index)
+return Series(duplicated_int64(ids, keep), index=self.index)

#----------------------------------------------------------------------
# Sorting
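Quick usage of the reworked DataFrame methods, with results hand-checked against the docstring semantics above:

```python
import pandas as pd

df = pd.DataFrame({'a': ['x', 'x', 'y', 'x'],
                   'b': [1, 1, 2, 1]})

df.duplicated(subset=['a', 'b'])               # [False, True, False, True]
df.duplicated(subset=['a', 'b'], keep='last')  # [True, True, False, False]
df.duplicated(subset=['a', 'b'], keep=False)   # [True, True, False, True]
df.drop_duplicates(['a', 'b'], keep=False)     # only the ('y', 2) row survives
```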
22 changes: 13 additions & 9 deletions pandas/core/index.py
@@ -16,7 +16,7 @@
from pandas.lib import Timestamp, Timedelta, is_datetime_array
from pandas.core.base import PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin, _shared_docs, PandasDelegate
from pandas.util.decorators import (Appender, Substitution, cache_readonly,
-deprecate)
+deprecate, deprecate_kwarg)
import pandas.core.common as com
from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype,
_values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype,
@@ -2628,13 +2628,15 @@ def drop(self, labels, errors='raise'):
indexer = indexer[~mask]
return self.delete(indexer)

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs)
-def drop_duplicates(self, take_last=False):
-return super(Index, self).drop_duplicates(take_last=take_last)
+def drop_duplicates(self, keep='first'):
+return super(Index, self).drop_duplicates(keep=keep)

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-def duplicated(self, take_last=False):
-return super(Index, self).duplicated(take_last=take_last)
+def duplicated(self, keep='first'):
+return super(Index, self).duplicated(keep=keep)

def _evaluate_with_timedelta_like(self, other, op, opstr):
raise TypeError("can only perform ops with timedelta like values")
@@ -3065,10 +3067,11 @@ def _engine(self):
def is_unique(self):
return not self.duplicated().any()

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-def duplicated(self, take_last=False):
+def duplicated(self, keep='first'):
from pandas.hashtable import duplicated_int64
-return duplicated_int64(self.codes.astype('i8'), take_last)
+return duplicated_int64(self.codes.astype('i8'), keep)

def get_loc(self, key, method=None):
"""
@@ -4228,15 +4231,16 @@ def _has_complex_internals(self):
def is_unique(self):
return not self.duplicated().any()

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
-def duplicated(self, take_last=False):
+def duplicated(self, keep='first'):
from pandas.core.groupby import get_group_index
from pandas.hashtable import duplicated_int64

shape = map(len, self.levels)
ids = get_group_index(self.labels, shape, sort=False, xnull=False)

-return duplicated_int64(ids, take_last)
+return duplicated_int64(ids, keep)

def get_value(self, series, key):
# somewhat broken encapsulation
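All three index variants above (flat `Index`, the codes-based categorical index, and `MultiIndex`) reduce their labels to int64 codes and defer to `duplicated_int64`, so `keep` behaves identically across them. A usage sketch:

```python
import pandas as pd

idx = pd.Index(['a', 'b', 'a', 'c', 'b'])
idx.duplicated()             # array([False, False,  True, False,  True])
idx.duplicated(keep='last')  # array([ True,  True, False, False, False])
idx.duplicated(keep=False)   # array([ True,  True,  True, False,  True])

mi = pd.MultiIndex.from_tuples([('a', 1), ('a', 1), ('b', 2)])
mi.duplicated(keep=False)    # array([ True,  True, False])
```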
13 changes: 7 additions & 6 deletions pandas/core/series.py
@@ -46,7 +46,7 @@
import pandas.core.datetools as datetools
import pandas.core.format as fmt
import pandas.core.nanops as nanops
-from pandas.util.decorators import Appender, cache_readonly
+from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg

import pandas.lib as lib
import pandas.tslib as tslib
@@ -1155,14 +1155,15 @@ def mode(self):
from pandas.core.algorithms import mode
return mode(self)

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs)
-def drop_duplicates(self, take_last=False, inplace=False):
-return super(Series, self).drop_duplicates(take_last=take_last,
-inplace=inplace)
+def drop_duplicates(self, keep='first', inplace=False):
+return super(Series, self).drop_duplicates(keep=keep, inplace=inplace)

+@deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
@Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs)
-def duplicated(self, take_last=False):
-return super(Series, self).duplicated(take_last=take_last)
+def duplicated(self, keep='first'):
+return super(Series, self).duplicated(keep=keep)

def idxmin(self, axis=None, out=None, skipna=True):
"""
28 changes: 22 additions & 6 deletions pandas/hashtable.pyx
@@ -1026,25 +1026,41 @@ def mode_int64(int64_t[:] values):

@cython.wraparound(False)
@cython.boundscheck(False)
-def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'):
cdef:
-int ret = 0
+int ret = 0, value, k
Py_ssize_t i, n = len(values)
kh_int64_t * table = kh_init_int64()
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT))

-with nogil:
-if take_last:
+if keep not in ('last', 'first', False):
+raise ValueError('keep must be either "first", "last" or False')
+
+if keep == 'last':
+with nogil:
for i from n > i >=0:
kh_put_int64(table, values[i], &ret)
out[i] = ret == 0
-else:
+elif keep == 'first':
+with nogil:
for i from 0 <= i < n:
kh_put_int64(table, values[i], &ret)
out[i] = ret == 0

+else:
+with nogil:
+for i from 0 <= i < n:
+value = values[i]
+k = kh_get_int64(table, value)
+if k != table.n_buckets:
+out[table.vals[k]] = 1
+out[i] = 1
+else:
+k = kh_put_int64(table, value, &ret)
+table.keys[k] = value
+table.vals[k] = i
+out[i] = 0
kh_destroy_int64(table)
return out

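The new `keep=False` branch stores the table slot of each value's first sighting so that a later repeat can retro-flag that earlier position in the same pass. The same idea in pure Python, with a dict standing in for the khash table:

```python
import numpy as np

def duplicated_keep_false(values):
    """Pure-Python sketch of the keep=False branch in duplicated_int64."""
    out = np.zeros(len(values), dtype=bool)
    first_seen = {}  # value -> index of its first occurrence (khash stand-in)
    for i, v in enumerate(values):
        if v in first_seen:
            out[first_seen[v]] = True  # flag the remembered first sighting
            out[i] = True              # and the current repeat
        else:
            first_seen[v] = i
    return out

duplicated_keep_false([1, 2, 1, 3, 2])
# array([ True,  True,  True, False,  True])
```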
26 changes: 19 additions & 7 deletions pandas/lib.pyx
@@ -1348,35 +1348,47 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):

return result

-def duplicated(ndarray[object] values, take_last=False):
+
+def duplicated(ndarray[object] values, object keep='first'):
cdef:
Py_ssize_t i, n
-set seen = set()
+dict seen = dict()
object row

n = len(values)
cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8)

-if take_last:
+if keep == 'last':
for i from n > i >= 0:
row = values[i]

if row in seen:
result[i] = 1
else:
-seen.add(row)
+seen[row] = i
result[i] = 0
-else:
+elif keep == 'first':
for i from 0 <= i < n:
row = values[i]
if row in seen:
result[i] = 1
else:
-seen.add(row)
+seen[row] = i
result[i] = 0
+elif keep is False:
+for i from 0 <= i < n:
+row = values[i]
+if row in seen:
+result[i] = 1
+result[seen[row]] = 1
+else:
+seen[row] = i
+result[i] = 0
+else:
+raise ValueError('keep must be either "first", "last" or False')

return result.view(np.bool_)


def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups):
cdef:
Py_ssize_t i, group_size, n, start
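A handy invariant ties the three modes together: a position is flagged under `keep=False` exactly when it is flagged under `'first'` or under `'last'`, since in any duplicate group of two or more each one-sided mode spares only a single, distinct element. A quick check through the public API:

```python
import pandas as pd

s = pd.Series(list('ABCABDAA'))
both_sided = s.duplicated(keep=False)
one_sided = s.duplicated(keep='first') | s.duplicated(keep='last')
assert (both_sided == one_sided).all()
```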
(The remaining 6 changed files are not shown.)