Skip to content

Commit

Permalink
add keep and subset arguments to drop_duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
mrocklin committed Nov 24, 2015
1 parent b276d7a commit 397734b
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 2 deletions.
6 changes: 4 additions & 2 deletions dask/dataframe/core.py
Expand Up @@ -241,8 +241,9 @@ def cache(self, cache=Cache):
return self._constructor(dsk2, name, self.column_info, self.divisions)

@derived_from(pd.DataFrame)
def drop_duplicates(self, **kwargs):
    # Remove duplicate rows/values, forwarding the supported keyword
    # arguments ('keep', 'subset', 'take_last') to the per-partition
    # ``drop_duplicates`` call.
    #
    # NOTE(review): no docstring is written here on the assumption that
    # ``derived_from(pd.DataFrame)`` supplies one -- confirm.
    #
    # Raises
    # ------
    # TypeError
    #     If any keyword other than 'keep', 'subset' or 'take_last' is given.
    allowed = ('keep', 'subset', 'take_last')
    unexpected = sorted(k for k in kwargs if k not in allowed)
    if unexpected:
        # Raise explicitly instead of using ``assert``: assertions are
        # stripped under ``python -O`` and give no hint about what was wrong.
        raise TypeError("drop_duplicates() got unexpected keyword "
                        "argument(s): %s" % ', '.join(map(repr, unexpected)))

    def chunk(s):
        # The same function is used for both stages: once on each piece and
        # once more on the combined result, so duplicates that span pieces
        # are also dropped.
        return s.drop_duplicates(**kwargs)

    return aca(self, chunk=chunk, aggregate=chunk, columns=self.column_info,
               token='drop-duplicates')

Expand Down Expand Up @@ -1522,6 +1523,7 @@ def apply(self, func, axis=0, args=(), columns=no_default, **kwds):
False, False, None, args, **kwds)



# bind operators
for op in [operator.abs, operator.add, operator.and_, operator_div,
operator.eq, operator.gt, operator.ge, operator.inv,
Expand Down
19 changes: 19 additions & 0 deletions dask/dataframe/tests/test_dataframe.py
Expand Up @@ -1222,6 +1222,25 @@ def test_drop_duplicates():
assert eq(d.index.drop_duplicates(), full.index.drop_duplicates())


def test_drop_duplicates_subset():
    """Check that ``drop_duplicates`` honours ``keep``/``take_last`` and
    ``subset`` identically on the pandas frame and on its dask counterpart.
    """
    import re

    df = pd.DataFrame({'x': [1, 2, 3, 1, 2, 3],
                       'y': ['a', 'a', 'b', 'b', 'c', 'c']})
    ddf = dd.from_pandas(df, npartitions=2)

    # Compare the version numerically.  Comparing the raw strings is
    # lexicographic, so e.g. '0.9' < '0.17' is False and '0.9' would be
    # mistaken for a release newer than 0.17.
    match = re.match(r'(\d+)\.(\d+)', pd.__version__)
    version = tuple(int(part) for part in match.groups()) if match else None

    if version is not None and version < (0, 17):
        # Older pandas spells the option ``take_last``.
        kwargs = [{'take_last': False}, {'take_last': True}]
    else:
        kwargs = [{'keep': 'first'}, {'keep': 'last'}]

    for kwarg in kwargs:
        # Series: only the keep/take_last option applies.
        assert eq(df.x.drop_duplicates(**kwarg),
                  ddf.x.drop_duplicates(**kwarg))
        # DataFrame: combine each option with several ``subset`` spellings
        # (single-column list, bare column name, multi-column list).
        for ss in [['x'], 'y', ['x', 'y']]:
            assert eq(df.drop_duplicates(subset=ss, **kwarg),
                      ddf.drop_duplicates(subset=ss, **kwarg))


def test_full_groupby():
assert raises(Exception, lambda: d.groupby('does_not_exist'))
assert raises(Exception, lambda: d.groupby('a').does_not_exist)
Expand Down

0 comments on commit 397734b

Please sign in to comment.