From 39b8ce3c26250528856aecb323ee96ca075b385b Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 13 Apr 2015 11:30:21 -0400 Subject: [PATCH] Implement nlargest and nsmallest for DataFrames --- doc/source/api.rst | 2 + doc/source/basics.rst | 14 +++++++ doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/frame.py | 73 +++++++++++++++++++++++++++++++++ pandas/tests/test_frame.py | 35 ++++++++++++++++ 5 files changed, 125 insertions(+) diff --git a/doc/source/api.rst b/doc/source/api.rst index a1284a3ff7bc9..1cbe55ddbacb6 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -904,6 +904,8 @@ Reshaping, sorting, transposing DataFrame.sort DataFrame.sort_index DataFrame.sortlevel + DataFrame.nlargest + DataFrame.nsmallest DataFrame.swaplevel DataFrame.stack DataFrame.unstack diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 524f57953d5b8..58374fabaec32 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1497,6 +1497,20 @@ faster than sorting the entire Series and calling ``head(n)`` on the result. s.nsmallest(3) s.nlargest(3) +.. versionadded:: 0.17.0 + +``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods. + +.. ipython:: python + + df = DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1], + 'b': list('abdceff'), + 'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]}) + df.nlargest(3, 'a') + df.nlargest(5, ['a', 'c']) + df.nsmallest(3, 'a') + df.nsmallest(5, ['a', 'c']) + .. _basics.multi-index_sorting: diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 16c6c639a489e..50a7c3b0c22e9 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -32,6 +32,7 @@ Check the :ref:`API Changes ` and :ref:`deprecations >> df = DataFrame({'a': [1, 10, 8, 11, -1], + ... 'b': list('abdce'), + ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) + >>> df.nlargest(3, 'a') + a b c + 3 11 c 3 + 1 10 b 2 + 2 8 d NaN + """ + return self._nsorted(columns, n, 'nlargest', take_last) + + def nsmallest(self, n, columns, take_last=False): + """Get the rows of a DataFrame sorted by the `n` smallest + values of `columns`. + + .. versionadded:: 0.17.0 + + Parameters + ---------- + n : int + Number of items to retrieve + columns : list or str + Column name or names to order by + take_last : bool, optional + Where there are duplicate values, take the last duplicate + + Returns + ------- + DataFrame + + Examples + -------- + >>> df = DataFrame({'a': [1, 10, 8, 11, -1], + ... 'b': list('abdce'), + ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) + >>> df.nsmallest(3, 'a') + a b c + 4 -1 e 4 + 0 1 a 1 + 2 8 d NaN + """ + return self._nsorted(columns, n, 'nsmallest', take_last) + def swaplevel(self, i, j, axis=0): """ Swap levels i and j in a MultiIndex on a particular axis diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3b93465c1efe9..77ef5fecf22c9 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -14609,6 +14609,41 @@ def test_dataframe_metadata(self): self.assertEqual(df._metadata, unpickled._metadata) self.assertEqual(df.testattr, unpickled.testattr) + def test_nlargest(self): + # GH10393 + from string import ascii_lowercase + df = pd.DataFrame({'a': np.random.permutation(10), + 'b': list(ascii_lowercase[:10])}) + result = df.nlargest(5, 'a') + expected = df.sort('a', ascending=False).head(5) + tm.assert_frame_equal(result, expected) + + def test_nlargest_multiple_columns(self): + from string import ascii_lowercase + df = pd.DataFrame({'a': np.random.permutation(10), + 'b': list(ascii_lowercase[:10]), + 'c': np.random.permutation(10).astype('float64')}) + result = df.nlargest(5, ['a', 'b']) + expected = df.sort(['a', 'b'], ascending=False).head(5) + tm.assert_frame_equal(result, expected) + + def test_nsmallest(self): + from string import ascii_lowercase + df = pd.DataFrame({'a': np.random.permutation(10), + 'b': list(ascii_lowercase[:10])}) + result = df.nsmallest(5, 'a') + expected = df.sort('a').head(5) + tm.assert_frame_equal(result, expected) + + def test_nsmallest_multiple_columns(self): + from string import ascii_lowercase + df = pd.DataFrame({'a': np.random.permutation(10), + 'b': list(ascii_lowercase[:10]), + 'c': np.random.permutation(10).astype('float64')}) + result = df.nsmallest(5, ['a', 'c']) + expected = df.sort(['a', 'c']).head(5) + tm.assert_frame_equal(result, expected) + def test_to_panel_expanddim(self): # GH 9762