Merge pull request #10393 from cpcloud/df-partial-sort

Add nlargest/nsmallest for DataFrame
pandas-dev · Aug 4, 2015 · 0479a80 · 0479a80
2 parents f1719b7 + 39b8ce3
commit 0479a80
Show file tree

Hide file tree

Showing 5 changed files with 125 additions and 0 deletions.
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -904,6 +904,8 @@ Reshaping, sorting, transposing
    DataFrame.sort
    DataFrame.sort_index
    DataFrame.sortlevel
+   DataFrame.nlargest
+   DataFrame.nsmallest
    DataFrame.swaplevel
    DataFrame.stack
    DataFrame.unstack

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1497,6 +1497,20 @@ faster than sorting the entire Series and calling ``head(n)`` on the result.
    s.nsmallest(3)
    s.nlargest(3)
 
+.. versionadded:: 0.17.0
+
+``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods, which take
+the number of rows to return and the column name, or list of names, to order by.
+
+.. ipython:: python
+
+   df = DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
+                   'b': list('abdceff'),
+                   'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
+   df.nlargest(3, 'a')
+   df.nlargest(5, ['a', 'c'])
+   df.nsmallest(3, 'a')
+   df.nsmallest(5, ['a', 'c'])
+
 
 .. _basics.multi-index_sorting:
 

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -32,6 +32,7 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
 New features
 ~~~~~~~~~~~~
 
+- ``DataFrame`` has the ``nlargest`` and ``nsmallest`` methods (:issue:`10393`)
 - SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
 - Enable writing complex values to HDF stores when using table format (:issue:`10447`)
 - Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3127,6 +3127,79 @@ def sortlevel(self, level=0, axis=0, ascending=True,
         else:
             return self._constructor(new_data).__finalize__(self)
 
+    def _nsorted(self, columns, n, method, take_last):
+        if not com.is_list_like(columns):
+            columns = [columns]
+        columns = list(columns)
+        ser = getattr(self[columns[0]], method)(n, take_last=take_last)
+        ascending = dict(nlargest=False, nsmallest=True)[method]
+        return self.loc[ser.index].sort(columns, ascending=ascending,
+                                        kind='mergesort')
+
+    def nlargest(self, n, columns, take_last=False):
+        """Get the rows of a DataFrame sorted by the `n` largest
+        values of `columns`.
+
+        .. versionadded:: 0.17.0
+
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve
+        columns : list or str
+            Column name or names to order by
+        take_last : bool, optional
+            Where there are duplicate values, take the last duplicate
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
+        ...                 'b': list('abdce'),
+        ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
+        >>> df.nlargest(3, 'a')
+            a  b   c
+        3  11  c   3
+        1  10  b   2
+        2   8  d NaN
+        """
+        return self._nsorted(columns, n, 'nlargest', take_last)
+
+    def nsmallest(self, n, columns, take_last=False):
+        """Get the rows of a DataFrame sorted by the `n` smallest
+        values of `columns`.
+
+        .. versionadded:: 0.17.0
+
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve
+        columns : list or str
+            Column name or names to order by
+        take_last : bool, optional
+            Where there are duplicate values, take the last duplicate
+
+        Returns
+        -------
+        DataFrame
+
+        Examples
+        --------
+        >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
+        ...                 'b': list('abdce'),
+        ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
+        >>> df.nsmallest(3, 'a')
+           a  b   c
+        4 -1  e   4
+        0  1  a   1
+        2  8  d NaN
+        """
+        return self._nsorted(columns, n, 'nsmallest', take_last)
+
     def swaplevel(self, i, j, axis=0):
         """
         Swap levels i and j in a MultiIndex on a particular axis

diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -14609,6 +14609,41 @@ def test_dataframe_metadata(self):
         self.assertEqual(df._metadata, unpickled._metadata)
         self.assertEqual(df.testattr, unpickled.testattr)
 
+    def test_nlargest(self):
+        # GH10393
+        from string import ascii_lowercase
+        df = pd.DataFrame({'a': np.random.permutation(10),
+                           'b': list(ascii_lowercase[:10])})
+        result = df.nlargest(5, 'a')
+        expected = df.sort('a', ascending=False).head(5)
+        tm.assert_frame_equal(result, expected)
+
+    def test_nlargest_multiple_columns(self):
+        from string import ascii_lowercase
+        df = pd.DataFrame({'a': np.random.permutation(10),
+                           'b': list(ascii_lowercase[:10]),
+                           'c': np.random.permutation(10).astype('float64')})
+        result = df.nlargest(5, ['a', 'b'])
+        expected = df.sort(['a', 'b'], ascending=False).head(5)
+        tm.assert_frame_equal(result, expected)
+
+    def test_nsmallest(self):
+        from string import ascii_lowercase
+        df = pd.DataFrame({'a': np.random.permutation(10),
+                           'b': list(ascii_lowercase[:10])})
+        result = df.nsmallest(5, 'a')
+        expected = df.sort('a').head(5)
+        tm.assert_frame_equal(result, expected)
+
+    def test_nsmallest_multiple_columns(self):
+        from string import ascii_lowercase
+        df = pd.DataFrame({'a': np.random.permutation(10),
+                           'b': list(ascii_lowercase[:10]),
+                           'c': np.random.permutation(10).astype('float64')})
+        result = df.nsmallest(5, ['a', 'c'])
+        expected = df.sort(['a', 'c']).head(5)
+        tm.assert_frame_equal(result, expected)
+
     def test_to_panel_expanddim(self):
         # GH 9762