Skip to content

Commit

Permalink
DEPR MultiIndex.is_lexsorted and MultiIndex.lexsort_depth (#38701)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Jan 4, 2021
1 parent 9952626 commit 586b490
Show file tree
Hide file tree
Showing 16 changed files with 114 additions and 85 deletions.
1 change: 0 additions & 1 deletion doc/source/reference/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,6 @@ MultiIndex components
MultiIndex.set_codes
MultiIndex.to_flat_index
MultiIndex.to_frame
MultiIndex.is_lexsorted
MultiIndex.sortlevel
MultiIndex.droplevel
MultiIndex.swaplevel
Expand Down
10 changes: 4 additions & 6 deletions doc/source/user_guide/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -658,20 +658,18 @@ Furthermore, if you try to index something that is not fully lexsorted, this can
In [5]: dfm.loc[(0, 'y'):(1, 'z')]
UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (1)'
The :meth:`~MultiIndex.is_lexsorted` method on a ``MultiIndex`` shows if the
index is sorted, and the ``lexsort_depth`` property returns the sort depth:
The :attr:`~MultiIndex.is_monotonic_increasing` property on a ``MultiIndex`` shows if the
index is sorted:

.. ipython:: python
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.index.is_monotonic_increasing
.. ipython:: python
dfm = dfm.sort_index()
dfm
dfm.index.is_lexsorted()
dfm.index.lexsort_depth
dfm.index.is_monotonic_increasing
And now selection works as expected.

Expand Down
38 changes: 28 additions & 10 deletions doc/source/whatsnew/v0.20.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -873,21 +873,27 @@ This is *unchanged* from prior versions, but shown for illustration purposes:
index=pd.MultiIndex.from_product([list('BA'), range(3)]))
df
.. ipython:: python
.. code-block:: python
df.index.is_lexsorted()
df.index.is_monotonic
In [87]: df.index.is_lexsorted()
Out[87]: False
In [88]: df.index.is_monotonic
Out[88]: False
Sorting works as expected

.. ipython:: python
df.sort_index()
.. ipython:: python
.. code-block:: python
In [90]: df.sort_index().index.is_lexsorted()
Out[90]: True
df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic
In [91]: df.sort_index().index.is_monotonic
Out[91]: True
However, this example, which has a non-monotonic 2nd level,
doesn't behave as desired.
Expand Down Expand Up @@ -919,11 +925,23 @@ Previous behavior:
New behavior:

.. ipython:: python
.. code-block:: python
df.sort_index()
df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic
In [94]: df.sort_index()
Out[94]:
value
a aa 2
bb 1
b aa 4
bb 3
[4 rows x 1 columns]
In [95]: df.sort_index().index.is_lexsorted()
Out[95]: True
In [96]: df.sort_index().index.is_monotonic
Out[96]: True
.. _whatsnew_0200.api_breaking.groupby_describe:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ Deprecations
- Deprecating allowing scalars passed to the :class:`Categorical` constructor (:issue:`38433`)
- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`,:issue:`21311`,:issue:`22315`,:issue:`26974`)
- Deprecated ``astype`` of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`)
- Deprecated :meth:`MultiIndex.is_lexsorted` and :attr:`MultiIndex.lexsort_depth` as public API, users should use :attr:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`)
- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`)
- Deprecated comparison of :class:`Timestamp` object with ``datetime.date`` objects. Instead of e.g. ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`)
-
Expand Down
66 changes: 43 additions & 23 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,11 +391,11 @@ def _verify_integrity(
f"Level values must be unique: {list(level)} on level {i}"
)
if self.sortorder is not None:
if self.sortorder > self._lexsort_depth():
if self.sortorder > _lexsort_depth(self.codes, self.nlevels):
raise ValueError(
"Value for sortorder must be inferior or equal to actual "
f"lexsort_depth: sortorder {self.sortorder} "
f"with lexsort_depth {self._lexsort_depth()}"
f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}"
)

codes = [
Expand Down Expand Up @@ -1809,6 +1809,15 @@ def _is_all_dates(self) -> bool:
return False

def is_lexsorted(self) -> bool:
    """
    Deprecated public wrapper around ``_is_lexsorted``.

    Emits a ``FutureWarning`` and then returns the result of
    ``self._is_lexsorted()``.

    Returns
    -------
    bool
    """
    deprecation_msg = (
        "MultiIndex.is_lexsorted is deprecated as a public function, "
        "users should use MultiIndex.is_monotonic_increasing instead."
    )
    # stacklevel=2 attributes the warning to the caller of this method.
    warnings.warn(deprecation_msg, FutureWarning, stacklevel=2)
    return self._is_lexsorted()

def _is_lexsorted(self) -> bool:
"""
Return True if the codes are lexicographically sorted.
Expand Down Expand Up @@ -1840,15 +1849,19 @@ def is_lexsorted(self) -> bool:
... ['bb', 'aa', 'aa', 'bb']]).is_lexsorted()
False
"""
return self.lexsort_depth == self.nlevels
return self._lexsort_depth == self.nlevels

@property
def lexsort_depth(self):
    """
    Deprecated public accessor for the lexsort depth.

    Emits a ``FutureWarning`` on every access and then returns
    ``self._lexsort_depth``.
    """
    # Name the attribute actually being accessed; the message previously
    # referred to ``is_lexsorted``, pointing users at the wrong call site.
    warnings.warn(
        "MultiIndex.lexsort_depth is deprecated as a public function, "
        "users should use MultiIndex.is_monotonic_increasing instead.",
        FutureWarning,
        stacklevel=2,
    )
    return self._lexsort_depth

@cache_readonly
def _lexsort_depth(self) -> int:
"""
Compute and return the lexsort_depth, the number of levels of the
Expand All @@ -1858,11 +1871,9 @@ def _lexsort_depth(self) -> int:
-------
int
"""
int64_codes = [ensure_int64(level_codes) for level_codes in self.codes]
for k in range(self.nlevels, 0, -1):
if libalgos.is_lexsorted(int64_codes[:k]):
return k
return 0
if self.sortorder is not None:
return self.sortorder
return _lexsort_depth(self.codes, self.nlevels)

def _sort_levels_monotonic(self):
"""
Expand Down Expand Up @@ -1898,7 +1909,7 @@ def _sort_levels_monotonic(self):
('b', 'bb')],
)
"""
if self.is_lexsorted() and self.is_monotonic:
if self._is_lexsorted() and self.is_monotonic:
return self

new_levels = []
Expand Down Expand Up @@ -2184,7 +2195,7 @@ def drop(self, codes, level=None, errors="raise"):
step = loc.step if loc.step is not None else 1
inds.extend(range(loc.start, loc.stop, step))
elif com.is_bool_indexer(loc):
if self.lexsort_depth == 0:
if self._lexsort_depth == 0:
warnings.warn(
"dropping on a non-lexsorted multi-index "
"without a level parameter may impact performance.",
Expand Down Expand Up @@ -2755,10 +2766,10 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
return super().slice_locs(start, end, step, kind=kind)

def _partial_tup_index(self, tup, side="left"):
if len(tup) > self.lexsort_depth:
if len(tup) > self._lexsort_depth:
raise UnsortedIndexError(
f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth "
f"({self.lexsort_depth})"
f"({self._lexsort_depth})"
)

n = len(tup)
Expand Down Expand Up @@ -2897,7 +2908,7 @@ def _maybe_to_slice(loc):
# break the key into 2 parts based on the lexsort_depth of the index;
# the first part returns a continuous slice of the index; the 2nd part
# needs linear search within the slice
i = self.lexsort_depth
i = self._lexsort_depth
lead_key, follow_key = key[:i], key[i:]
start, stop = (
self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self))
Expand Down Expand Up @@ -3150,7 +3161,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
stop = getattr(stop, "stop", stop)
return convert_indexer(start, stop, step)

elif level > 0 or self.lexsort_depth == 0 or step is not None:
elif level > 0 or self._lexsort_depth == 0 or step is not None:
# need to have like semantics here to right
# searching as when we are using a slice
# so include the stop+1 (so we include stop)
Expand All @@ -3165,7 +3176,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):

idx = self._get_loc_single_level_index(level_index, key)

if level > 0 or self.lexsort_depth == 0:
if level > 0 or self._lexsort_depth == 0:
# Desired level is not sorted
locs = np.array(level_codes == idx, dtype=bool, copy=False)
if not locs.any():
Expand Down Expand Up @@ -3222,10 +3233,10 @@ def get_locs(self, seq):

# must be lexsorted to at least as many levels
true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]
if true_slices and true_slices[-1] >= self.lexsort_depth:
if true_slices and true_slices[-1] >= self._lexsort_depth:
raise UnsortedIndexError(
"MultiIndex slicing requires the index to be lexsorted: slicing "
f"on levels {true_slices}, lexsort depth {self.lexsort_depth}"
f"on levels {true_slices}, lexsort depth {self._lexsort_depth}"
)
# indexer
# this is the list of all values that we want to select
Expand Down Expand Up @@ -3347,7 +3358,7 @@ def _reorder_indexer(
"""
# If the index is lexsorted and the list_like label in seq are sorted
# then we do not need to sort
if self.is_lexsorted():
if self._is_lexsorted():
need_sort = False
for i, k in enumerate(seq):
if is_list_like(k):
Expand Down Expand Up @@ -3768,6 +3779,15 @@ def isin(self, values, level=None):
__inv__ = make_invalid_op("__inv__")


def _lexsort_depth(codes: List[np.ndarray], nlevels: int) -> int:
    """
    Return the longest leading run of ``codes`` that is lexsorted.

    Prefix lengths are tried from ``nlevels`` down to 1; the first length
    ``k`` for which ``libalgos.is_lexsorted`` accepts the first ``k`` code
    arrays (after ``ensure_int64`` conversion) is returned. If no prefix
    qualifies, 0 is returned.
    """
    codes_as_int64 = [ensure_int64(arr) for arr in codes]
    depth = nlevels
    while depth > 0:
        if libalgos.is_lexsorted(codes_as_int64[:depth]):
            return depth
        depth -= 1
    return 0


def sparsify_labels(label_list, start: int = 0, sentinel=""):
pivoted = list(zip(*label_list))
k = len(label_list)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ def _convert_level_number(level_num, columns):
roll_columns = roll_columns.swaplevel(lev1, lev2)
this.columns = roll_columns

if not this.columns.is_lexsorted():
if not this.columns._is_lexsorted():
# Workaround the edge case where 0 is one of the column names,
# which interferes with trying to sort based on the first
# level
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def test_drop_multiindex_not_lexsorted(self):
[("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
)
lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
assert lexsorted_df.columns.is_lexsorted()
assert lexsorted_df.columns._is_lexsorted()

# define the non-lexsorted version
not_lexsorted_df = DataFrame(
Expand All @@ -172,7 +172,7 @@ def test_drop_multiindex_not_lexsorted(self):
index="a", columns=["b", "c"], values="d"
)
not_lexsorted_df = not_lexsorted_df.reset_index()
assert not not_lexsorted_df.columns.is_lexsorted()
assert not not_lexsorted_df.columns._is_lexsorted()

# compare the results
tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)
Expand Down
17 changes: 5 additions & 12 deletions pandas/tests/frame/methods/test_sort_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
),
)
assert df.index.is_lexsorted()
assert df.index._is_lexsorted()
assert not df.index.is_monotonic

# sort it
Expand All @@ -35,15 +35,13 @@ def test_sort_index_and_reconstruction_doc_example(self):
),
)
result = df.sort_index()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)

# reconstruct
result = df.sort_index().copy()
result.index = result.index._sort_levels_monotonic()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -524,14 +522,13 @@ def test_sort_index_and_reconstruction(self):
[(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
),
)
assert expected.index.is_lexsorted()
assert expected.index._is_lexsorted()

result = DataFrame(
[[1, 1], [2, 2], [1, 1], [2, 2]],
index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
)
result = result.sort_index()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand All @@ -543,14 +540,13 @@ def test_sort_index_and_reconstruction(self):
),
)
result = result.sort_index()
assert result.index.is_lexsorted()
assert result.index._is_lexsorted()

tm.assert_frame_equal(result, expected)

concatted = pd.concat([df, df], keys=[0.8, 0.5])
result = concatted.sort_index()

assert result.index.is_lexsorted()
assert result.index.is_monotonic

tm.assert_frame_equal(result, expected)
Expand All @@ -567,13 +563,10 @@ def test_sort_index_and_reconstruction(self):
df.columns = df.columns.set_levels(
pd.to_datetime(df.columns.levels[1]), level=1
)
assert not df.columns.is_lexsorted()
assert not df.columns.is_monotonic
result = df.sort_index(axis=1)
assert result.columns.is_lexsorted()
assert result.columns.is_monotonic
result = df.sort_index(axis=1, level=1)
assert result.columns.is_lexsorted()
assert result.columns.is_monotonic

# TODO: better name, de-duplicate with test_sort_index_level above
Expand Down Expand Up @@ -614,7 +607,7 @@ def test_sort_index_level_large_cardinality(self):

# it works!
result = df.sort_index(level=0)
assert result.index.lexsort_depth == 3
assert result.index._lexsort_depth == 3

# GH#2684 (int32)
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
Expand All @@ -623,7 +616,7 @@ def test_sort_index_level_large_cardinality(self):
# it works!
result = df.sort_index(level=0)
assert (result.dtypes.values == df.dtypes.values).all()
assert result.index.lexsort_depth == 3
assert result.index._lexsort_depth == 3

def test_sort_index_level_by_name(self):
mi = MultiIndex(
Expand Down

0 comments on commit 586b490

Please sign in to comment.