Skip to content

Commit

Permalink
Merge pull request #10393 from cpcloud/df-partial-sort
Browse files Browse the repository at this point in the history
Add nlargest/nsmallest for DataFrame
  • Loading branch information
jreback committed Aug 4, 2015
2 parents f1719b7 + 39b8ce3 commit 0479a80
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 0 deletions.
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,8 @@ Reshaping, sorting, transposing
DataFrame.sort
DataFrame.sort_index
DataFrame.sortlevel
DataFrame.nlargest
DataFrame.nsmallest
DataFrame.swaplevel
DataFrame.stack
DataFrame.unstack
Expand Down
14 changes: 14 additions & 0 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1497,6 +1497,20 @@ faster than sorting the entire Series and calling ``head(n)`` on the result.
s.nsmallest(3)
s.nlargest(3)
.. versionadded:: 0.17.0

``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods.

.. ipython:: python

   df = DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
                   'b': list('abdceff'),
                   'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
   df.nlargest(3, 'a')
   df.nlargest(5, ['a', 'c'])
   df.nsmallest(3, 'a')
   df.nsmallest(5, ['a', 'c'])

.. _basics.multi-index_sorting:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
New features
~~~~~~~~~~~~

- ``DataFrame`` has the ``nlargest`` and ``nsmallest`` methods (:issue:`10393`)
- SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
- Enable writing complex values to HDF stores when using table format (:issue:`10447`)
- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
Expand Down
73 changes: 73 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3127,6 +3127,79 @@ def sortlevel(self, level=0, axis=0, ascending=True,
else:
return self._constructor(new_data).__finalize__(self)

def _nsorted(self, columns, n, method, take_last):
    """Shared implementation behind ``nlargest`` and ``nsmallest``.

    Parameters
    ----------
    columns : list or str
        Column name or names to order by; a scalar is treated as a
        one-element list
    n : int
        Number of rows to select
    method : {'nlargest', 'nsmallest'}
        Name of the Series method used to pick the rows
    take_last : bool
        Forwarded unchanged to the Series method

    Returns
    -------
    DataFrame
        The selected rows, sorted by all of `columns` (descending for
        'nlargest', ascending for 'nsmallest')
    """
    sort_cols = list(columns) if com.is_list_like(columns) else [columns]

    # Only the first column drives the partial selection; any remaining
    # columns take part solely in the final ordering of the chosen rows.
    selector = getattr(self[sort_cols[0]], method)
    picked = selector(n, take_last=take_last)

    ascending = {'nlargest': False, 'nsmallest': True}[method]

    # Rows are looked up by index label, then ordered with a stable sort
    # so rows that compare equal keep the order in which they were picked.
    subset = self.loc[picked.index]
    return subset.sort(sort_cols, ascending=ascending, kind='mergesort')

def nlargest(self, n, columns, take_last=False):
    """Return the `n` rows with the largest values in `columns`.

    The work is delegated to ``_nsorted`` with ``method='nlargest'``.

    .. versionadded:: 0.17.0

    Parameters
    ----------
    n : int
        Number of items to retrieve
    columns : list or str
        Column name or names to order by
    take_last : bool, optional
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
    ...                 'b': list('abdce'),
    ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
    >>> df.nlargest(3, 'a')
        a  b   c
    3  11  c   3
    1  10  b   2
    2   8  d NaN
    """
    return self._nsorted(columns=columns, n=n, method='nlargest',
                         take_last=take_last)

def nsmallest(self, n, columns, take_last=False):
    """Return the `n` rows with the smallest values in `columns`.

    The work is delegated to ``_nsorted`` with ``method='nsmallest'``.

    .. versionadded:: 0.17.0

    Parameters
    ----------
    n : int
        Number of items to retrieve
    columns : list or str
        Column name or names to order by
    take_last : bool, optional
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
    ...                 'b': list('abdce'),
    ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
    >>> df.nsmallest(3, 'a')
       a  b   c
    4 -1  e   4
    0  1  a   1
    2  8  d NaN
    """
    return self._nsorted(columns=columns, n=n, method='nsmallest',
                         take_last=take_last)

def swaplevel(self, i, j, axis=0):
"""
Swap levels i and j in a MultiIndex on a particular axis
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -14609,6 +14609,41 @@ def test_dataframe_metadata(self):
self.assertEqual(df._metadata, unpickled._metadata)
self.assertEqual(df.testattr, unpickled.testattr)

def test_nlargest(self):
    # GH10393
    # nlargest on a single column must match a full descending sort
    # followed by head(n).
    import string

    # 'a' is a shuffled 0..9, so every value is unique and the expected
    # ordering is unambiguous.
    frame = pd.DataFrame({'a': np.random.permutation(10),
                          'b': list(string.ascii_lowercase[:10])})
    expected = frame.sort('a', ascending=False).head(5)
    result = frame.nlargest(5, 'a')
    tm.assert_frame_equal(result, expected)

def test_nlargest_multiple_columns(self):
    # nlargest with a list of columns must match a full descending sort
    # on those columns followed by head(n).
    import string

    # NOTE(review): column 'c' is built but the ordering below uses
    # ['a', 'b'], whereas test_nsmallest_multiple_columns orders by
    # ['a', 'c'] -- confirm which pair was intended.
    frame = pd.DataFrame({'a': np.random.permutation(10),
                          'b': list(string.ascii_lowercase[:10]),
                          'c': np.random.permutation(10).astype('float64')})
    order_by = ['a', 'b']
    expected = frame.sort(order_by, ascending=False).head(5)
    result = frame.nlargest(5, order_by)
    tm.assert_frame_equal(result, expected)

def test_nsmallest(self):
    # nsmallest on a single column must match a full ascending sort
    # followed by head(n).
    import string

    # 'a' is a shuffled 0..9, so every value is unique and the expected
    # ordering is unambiguous.
    frame = pd.DataFrame({'a': np.random.permutation(10),
                          'b': list(string.ascii_lowercase[:10])})
    expected = frame.sort('a').head(5)
    result = frame.nsmallest(5, 'a')
    tm.assert_frame_equal(result, expected)

def test_nsmallest_multiple_columns(self):
    # nsmallest with a list of columns must match a full ascending sort
    # on those columns followed by head(n).
    import string

    frame = pd.DataFrame({'a': np.random.permutation(10),
                          'b': list(string.ascii_lowercase[:10]),
                          'c': np.random.permutation(10).astype('float64')})
    order_by = ['a', 'c']
    expected = frame.sort(order_by).head(5)
    result = frame.nsmallest(5, order_by)
    tm.assert_frame_equal(result, expected)

def test_to_panel_expanddim(self):
# GH 9762

Expand Down

0 comments on commit 0479a80

Please sign in to comment.