BUG: fix degenerate MultiIndex sorting (#16092)

xref #15694 closes #15797
pandas-dev · Apr 22, 2017 · c847884 · c847884
1 parent 5b07d02
commit c847884
Show file tree

Hide file tree

Showing 6 changed files with 36 additions and 16 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -870,7 +870,7 @@ DataFrame.sort_index changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
-This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
+This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`, :issue:`15797`)
 
 This is *unchanged* from prior versions, but shown for illustration purposes:
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3364,6 +3364,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                                    axis=baxis,
                                    convert=False, verify=False)
 
+        # reconstruct axis if needed
+        new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
+
         if inplace:
             return self._update_inplace(new_data)
         else:

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -465,6 +465,10 @@ def _update_inplace(self, result, **kwargs):
         # guard when called from IndexOpsMixin
         raise TypeError("Index can't be updated inplace")
 
+    def _sort_levels_monotonic(self):
+        """ compat with MultiIndex """
+        return self
+
     _index_shared_docs['_get_grouper_for_level'] = """
         Get index grouper corresponding to an index level
 

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -650,8 +650,15 @@ def _convert_level_number(level_num, columns):
     drop_cols = []
     for key in unique_groups:
         loc = this.columns.get_loc(key)
-        slice_len = loc.stop - loc.start
+
         # can make more efficient?
+        # we almost always return a slice
+        # but if unsorted can get a boolean
+        # indexer
+        if not isinstance(loc, slice):
+            slice_len = len(loc)
+        else:
+            slice_len = loc.stop - loc.start
 
         if slice_len == 0:
             drop_cols.append(key)

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1773,6 +1773,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
 
         indexer = _ensure_platform_int(indexer)
         new_index = index.take(indexer)
+        new_index = new_index._sort_levels_monotonic()
 
         new_values = self._values.take(indexer)
         result = self._constructor(new_values, index=new_index)

diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -11,7 +11,6 @@
 from pandas.core.index import Index, MultiIndex
 from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp
 
-from pandas.core.common import UnsortedIndexError
 from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
 import pandas.core.common as com
 import pandas.util.testing as tm
@@ -938,7 +937,7 @@ def test_stack_mixed_dtype(self):
         df = df.sort_index(level=1, axis=1)
 
         stacked = df.stack()
-        result = df['foo'].stack()
+        result = df['foo'].stack().sort_index()
         tm.assert_series_equal(stacked['foo'], result, check_names=False)
         self.assertIs(result.name, None)
         self.assertEqual(stacked['bar'].dtype, np.float_)
@@ -2456,11 +2455,11 @@ def test_frame_getitem_not_sorted2(self):
 
         assert df2_original.index.equals(df2.index)
         expected = df2.sort_index()
-        assert not expected.index.is_lexsorted()
+        assert expected.index.is_lexsorted()
         assert expected.index.is_monotonic
 
         result = df2.sort_index(level=0)
-        assert not result.index.is_lexsorted()
+        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
         tm.assert_frame_equal(result, expected)
 
@@ -2536,8 +2535,7 @@ def test_sort_index_and_reconstruction(self):
         concatted = pd.concat([df, df], keys=[0.8, 0.5])
         result = concatted.sort_index()
 
-        # this will be monotonic, but not lexsorted!
-        assert not result.index.is_lexsorted()
+        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
         tm.assert_frame_equal(result, expected)
@@ -2576,7 +2574,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
                                  levels=[['a', 'b'], ['aa', 'bb']],
                                  labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
         result = df.sort_index()
-        assert not result.index.is_lexsorted()
+        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
         tm.assert_frame_equal(result, expected)
@@ -2618,22 +2616,29 @@ def my_func(group):
     def test_sort_non_lexsorted(self):
         # degenerate case where we sort but don't
         # have a satisfying result :<
-
+        # GH 15797
         idx = MultiIndex([['A', 'B', 'C'],
                           ['c', 'b', 'a']],
                          [[0, 1, 2, 0, 1, 2],
                           [0, 2, 1, 1, 0, 2]])
 
-        df = DataFrame({'col': range(len(idx))}, index=idx)
+        df = DataFrame({'col': range(len(idx))},
+                       index=idx,
+                       dtype='int64')
         assert df.index.is_lexsorted() is False
         assert df.index.is_monotonic is False
 
-        result = df.sort_index()
-        assert result.index.is_lexsorted() is False
-        assert result.index.is_monotonic is True
+        sorted = df.sort_index()
+        assert sorted.index.is_lexsorted() is True
+        assert sorted.index.is_monotonic is True
 
-        with pytest.raises(UnsortedIndexError):
-            result.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
+        expected = DataFrame(
+            {'col': [1, 4, 5, 2]},
+            index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'),
+                                          ('C', 'a'), ('C', 'b')]),
+            dtype='int64')
+        result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
+        tm.assert_frame_equal(result, expected)
 
     def test_sort_index_nan(self):
         # GH 14784