Skip to content

Commit

Permalink
BUG: fix degenerate MultiIndex sorting (#16092)
Browse files (browse the repository at this point in the history)
xref #15694
closes #15797
  • Loading branch information
jreback committed Apr 22, 2017
1 parent 5b07d02 commit c847884
Show file tree
Hide file tree
Showing 6 changed files with 36 additions and 16 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ DataFrame.sort_index changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`, :issue:`15797`)

This is *unchanged* from prior versions, but shown for illustration purposes:

Expand Down
3 changes: 3 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3364,6 +3364,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
axis=baxis,
convert=False, verify=False)

# reconstruct axis if needed
new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()

if inplace:
return self._update_inplace(new_data)
else:
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,10 @@ def _update_inplace(self, result, **kwargs):
# guard when called from IndexOpsMixin
raise TypeError("Index can't be updated inplace")

def _sort_levels_monotonic(self):
    """
    Return this index unchanged.

    Kept for compat with MultiIndex, so that callers can invoke this
    method on an index without first checking its type.
    """
    return self

_index_shared_docs['_get_grouper_for_level'] = """
Get index grouper corresponding to an index level
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -650,8 +650,15 @@ def _convert_level_number(level_num, columns):
drop_cols = []
for key in unique_groups:
loc = this.columns.get_loc(key)
slice_len = loc.stop - loc.start

# can make more efficient?
# we almost always return a slice
# but if unsorted can get a boolean
# indexer
if not isinstance(loc, slice):
slice_len = len(loc)
else:
slice_len = loc.stop - loc.start

if slice_len == 0:
drop_cols.append(key)
Expand Down
1 change: 1 addition & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1773,6 +1773,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,

indexer = _ensure_platform_int(indexer)
new_index = index.take(indexer)
new_index = new_index._sort_levels_monotonic()

new_values = self._values.take(indexer)
result = self._constructor(new_values, index=new_index)
Expand Down
33 changes: 19 additions & 14 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from pandas.core.index import Index, MultiIndex
from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp

from pandas.core.common import UnsortedIndexError
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
import pandas.core.common as com
import pandas.util.testing as tm
Expand Down Expand Up @@ -938,7 +937,7 @@ def test_stack_mixed_dtype(self):
df = df.sort_index(level=1, axis=1)

stacked = df.stack()
result = df['foo'].stack()
result = df['foo'].stack().sort_index()
tm.assert_series_equal(stacked['foo'], result, check_names=False)
self.assertIs(result.name, None)
self.assertEqual(stacked['bar'].dtype, np.float_)
Expand Down Expand Up @@ -2456,11 +2455,11 @@ def test_frame_getitem_not_sorted2(self):

assert df2_original.index.equals(df2.index)
expected = df2.sort_index()
assert not expected.index.is_lexsorted()
assert expected.index.is_lexsorted()
assert expected.index.is_monotonic

result = df2.sort_index(level=0)
assert not result.index.is_lexsorted()
assert result.index.is_lexsorted()
assert result.index.is_monotonic
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -2536,8 +2535,7 @@ def test_sort_index_and_reconstruction(self):
concatted = pd.concat([df, df], keys=[0.8, 0.5])
result = concatted.sort_index()

# this will be monotonic, but not lexsorted!
assert not result.index.is_lexsorted()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -2576,7 +2574,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
levels=[['a', 'b'], ['aa', 'bb']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
result = df.sort_index()
assert not result.index.is_lexsorted()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -2618,22 +2616,29 @@ def my_func(group):
def test_sort_non_lexsorted(self):
# degenerate case where we sort but don't
# have a satisfying result :<

# GH 15797
idx = MultiIndex([['A', 'B', 'C'],
['c', 'b', 'a']],
[[0, 1, 2, 0, 1, 2],
[0, 2, 1, 1, 0, 2]])

df = DataFrame({'col': range(len(idx))}, index=idx)
df = DataFrame({'col': range(len(idx))},
index=idx,
dtype='int64')
assert df.index.is_lexsorted() is False
assert df.index.is_monotonic is False

result = df.sort_index()
assert result.index.is_lexsorted() is False
assert result.index.is_monotonic is True
sorted = df.sort_index()
assert sorted.index.is_lexsorted() is True
assert sorted.index.is_monotonic is True

with pytest.raises(UnsortedIndexError):
result.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
expected = DataFrame(
{'col': [1, 4, 5, 2]},
index=MultiIndex.from_tuples([('B', 'a'), ('B', 'c'),
('C', 'a'), ('C', 'b')]),
dtype='int64')
result = sorted.loc[pd.IndexSlice['B':'C', 'a':'c'], :]
tm.assert_frame_equal(result, expected)

def test_sort_index_nan(self):
# GH 14784
Expand Down

0 comments on commit c847884

Please sign in to comment.