Skip to content

Commit

Permalink
simplified DataFrame.duplicated a bit
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Mar 3, 2015
1 parent 1ab0e5f commit 7da9178
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 14 deletions.
7 changes: 4 additions & 3 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def _unique_generic(values, table_type, type_caster):



def factorize(values, sort=False, order=None, na_sentinel=-1):
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
"""
Encode input values as an enumerated type or categorical variable
Expand All @@ -106,8 +106,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
sort : boolean, default False
Sort by values
order : deprecated
na_sentinel: int, default -1
na_sentinel : int, default -1
Value to mark "not found"
size_hint : int, optional
Hint to the hashtable sizer; when not given, the number of values is used
Returns
-------
Expand All @@ -129,7 +130,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
is_timedelta = com.is_timedelta64_dtype(vals)
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

table = hash_klass(len(vals))
table = hash_klass(size_hint or len(vals))
uniques = vec_klass()
labels = table.get_labels(vals, uniques, 0, na_sentinel)

Expand Down
16 changes: 5 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2832,18 +2832,12 @@ def duplicated(self, subset=None, take_last=False):
duplicated : Series
"""
from pandas.core.groupby import get_group_index
from pandas.core.algorithms import factorize
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT

size_hint = min(len(self), _SIZE_HINT_LIMIT)

def factorize(vals):
(hash_klass, vec_klass), vals = \
algos._get_data_algo(vals, algos._hashtables)

uniques, table = vec_klass(), hash_klass(size_hint)
labels = table.get_labels(vals, uniques, 0, -1)

return labels.astype('i8', copy=False), len(uniques)
def f(vals):
labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
return labels.astype('i8',copy=False), len(shape)

if subset is None:
subset = self.columns
Expand All @@ -2853,7 +2847,7 @@ def factorize(vals):
subset = subset,

vals = (self[col].values for col in subset)
labels, shape = map(list, zip( * map(factorize, vals)))
labels, shape = map(list, zip( * map(f, vals)))

ids = get_group_index(labels, shape, sort=False, xnull=False)
return Series(duplicated_int64(ids, take_last), index=self.index)
Expand Down

0 comments on commit 7da9178

Please sign in to comment.