Skip to content

Commit

Permalink
performance improvement in DataFrame.duplicated
Browse files Browse the repository at this point in the history
  • Loading branch information
behzadnouri authored and jreback committed Mar 3, 2015
1 parent b1aeb77 commit 1ab0e5f
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 24 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ Performance
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
- Performance improvements in multi-key ``groupby`` (:issue:`9429`)
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
- Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`)
- Cythonized ``Period`` (:issue:`9440`)

Bug Fixes
Expand Down
44 changes: 23 additions & 21 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2831,30 +2831,32 @@ def duplicated(self, subset=None, take_last=False):
-------
duplicated : Series
"""
# kludge for #1833
def _m8_to_i8(x):
if issubclass(x.dtype.type, np.datetime64):
return x.view(np.int64)
return x
from pandas.core.groupby import get_group_index
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT

size_hint = min(len(self), _SIZE_HINT_LIMIT)

def factorize(vals):
(hash_klass, vec_klass), vals = \
algos._get_data_algo(vals, algos._hashtables)

uniques, table = vec_klass(), hash_klass(size_hint)
labels = table.get_labels(vals, uniques, 0, -1)

return labels.astype('i8', copy=False), len(uniques)

# if we are only duplicating on Categoricals this can be much faster
if subset is None:
values = list(_m8_to_i8(self.get_values().T))
else:
if np.iterable(subset) and not isinstance(subset, compat.string_types):
if isinstance(subset, tuple):
if subset in self.columns:
values = [self[subset].get_values()]
else:
values = [_m8_to_i8(self[x].get_values()) for x in subset]
else:
values = [_m8_to_i8(self[x].get_values()) for x in subset]
else:
values = [self[subset].get_values()]
subset = self.columns
elif not np.iterable(subset) or \
isinstance(subset, compat.string_types) or \
isinstance(subset, tuple) and subset in self.columns:
subset = subset,

vals = (self[col].values for col in subset)
labels, shape = map(list, zip( * map(factorize, vals)))

keys = lib.fast_zip_fillna(values)
duplicated = lib.duplicated(keys, take_last=take_last)
return Series(duplicated, index=self.index)
ids = get_group_index(labels, shape, sort=False, xnull=False)
return Series(duplicated_int64(ids, take_last), index=self.index)

#----------------------------------------------------------------------
# Sorting
Expand Down
6 changes: 3 additions & 3 deletions pandas/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,7 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
def duplicated(ndarray[object] values, take_last=False):
cdef:
Py_ssize_t i, n
dict seen = {}
set seen = set()
object row

n = len(values)
Expand All @@ -1291,15 +1291,15 @@ def duplicated(ndarray[object] values, take_last=False):
if row in seen:
result[i] = 1
else:
seen[row] = None
seen.add(row)
result[i] = 0
else:
for i from 0 <= i < n:
row = values[i]
if row in seen:
result[i] = 1
else:
seen[row] = None
seen.add(row)
result[i] = 0

return result.view(np.bool_)
Expand Down
14 changes: 14 additions & 0 deletions vb_suite/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,3 +506,17 @@ def get_data(n=100000):
setup,
name='frame_from_records_generator_nrows',
start_date=datetime(2013,10,04)) # issue-4911

setup = common_setup + '''
n = 1 << 20
t = date_range('2015-01-01', freq='S', periods=n // 64)
xs = np.random.randn(n // 64).round(2)
df = DataFrame({'a':np.random.randint(- 1 << 8, 1 << 8, n),
'b':np.random.choice(t, n),
'c':np.random.choice(xs, n)})
'''

frame_duplicated = Benchmark('df.duplicated()', setup,
name='frame_duplicated')

0 comments on commit 1ab0e5f

Please sign in to comment.