DEPR MultiIndex.is_lexsorted and MultiIndex.lexsort_depth (#38701)

pandas-dev · Jan 4, 2021 · 586b490 · 586b490
1 parent 9952626
commit 586b490
Show file tree

Hide file tree

Showing 16 changed files with 114 additions and 85 deletions.
diff --git a/doc/source/reference/indexing.rst b/doc/source/reference/indexing.rst
@@ -301,7 +301,6 @@ MultiIndex components
    MultiIndex.set_codes
    MultiIndex.to_flat_index
    MultiIndex.to_frame
-   MultiIndex.is_lexsorted
    MultiIndex.sortlevel
    MultiIndex.droplevel
    MultiIndex.swaplevel

diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst
@@ -658,20 +658,18 @@ Furthermore, if you try to index something that is not fully lexsorted, this can
     In [5]: dfm.loc[(0, 'y'):(1, 'z')]
     UnsortedIndexError: 'Key length (2) was greater than MultiIndex lexsort depth (1)'
 
-The :meth:`~MultiIndex.is_lexsorted` method on a ``MultiIndex`` shows if the
-index is sorted, and the ``lexsort_depth`` property returns the sort depth:
+The :attr:`~MultiIndex.is_monotonic_increasing` property on a ``MultiIndex`` shows if the
+index is sorted:
 
 .. ipython:: python
 
-   dfm.index.is_lexsorted()
-   dfm.index.lexsort_depth
+   dfm.index.is_monotonic_increasing
 
 .. ipython:: python
 
    dfm = dfm.sort_index()
    dfm
-   dfm.index.is_lexsorted()
-   dfm.index.lexsort_depth
+   dfm.index.is_monotonic_increasing
 
 And now selection works as expected.
 

diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst
@@ -873,21 +873,27 @@ This is *unchanged* from prior versions, but shown for illustration purposes:
                      index=pd.MultiIndex.from_product([list('BA'), range(3)]))
    df
 
-.. ipython:: python
+.. code-block:: python
 
-   df.index.is_lexsorted()
-   df.index.is_monotonic
+   In [87]: df.index.is_lexsorted()
+   Out[87]: False
+
+   In [88]: df.index.is_monotonic
+   Out[88]: False
 
 Sorting works as expected
 
 .. ipython:: python
 
    df.sort_index()
 
-.. ipython:: python
+.. code-block:: python
+
+   In [90]: df.sort_index().index.is_lexsorted()
+   Out[90]: True
 
-   df.sort_index().index.is_lexsorted()
-   df.sort_index().index.is_monotonic
+   In [91]: df.sort_index().index.is_monotonic
+   Out[91]: True
 
 However, this example, which has a non-monotonic 2nd level,
 doesn't behave as desired.
@@ -919,11 +925,23 @@ Previous behavior:
 
 New behavior:
 
-.. ipython:: python
+.. code-block:: python
 
-   df.sort_index()
-   df.sort_index().index.is_lexsorted()
-   df.sort_index().index.is_monotonic
+   In [94]: df.sort_index()
+   Out[94]:
+         value
+   a aa      2
+     bb      1
+   b aa      4
+     bb      3
+
+   [4 rows x 1 columns]
+
+   In [95]: df.sort_index().index.is_lexsorted()
+   Out[95]: True
+
+   In [96]: df.sort_index().index.is_monotonic
+   Out[96]: True
 
 
 .. _whatsnew_0200.api_breaking.groupby_describe:

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -156,6 +156,7 @@ Deprecations
 - Deprecating allowing scalars passed to the :class:`Categorical` constructor (:issue:`38433`)
 - Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`,:issue:`21311`,:issue:`22315`,:issue:`26974`)
 - Deprecated ``astype`` of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`)
+- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth` as public methods, users should use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`)
 - Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`)
 - Deprecated comparison of :class:`Timestamp` object with ``datetime.date`` objects.  Instead of e.g. ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`)
 -

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -391,11 +391,11 @@ def _verify_integrity(
                     f"Level values must be unique: {list(level)} on level {i}"
                 )
         if self.sortorder is not None:
-            if self.sortorder > self._lexsort_depth():
+            if self.sortorder > _lexsort_depth(self.codes, self.nlevels):
                 raise ValueError(
                     "Value for sortorder must be inferior or equal to actual "
                     f"lexsort_depth: sortorder {self.sortorder} "
-                    f"with lexsort_depth {self._lexsort_depth()}"
+                    f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}"
                 )
 
         codes = [
@@ -1809,6 +1809,15 @@ def _is_all_dates(self) -> bool:
         return False
 
     def is_lexsorted(self) -> bool:
+        warnings.warn(
+            "MultiIndex.is_lexsorted is deprecated as a public function, "
+            "users should use MultiIndex.is_monotonic_increasing instead.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        return self._is_lexsorted()
+
+    def _is_lexsorted(self) -> bool:
         """
         Return True if the codes are lexicographically sorted.
 
@@ -1840,15 +1849,19 @@ def is_lexsorted(self) -> bool:
         ...                            ['bb', 'aa', 'aa', 'bb']]).is_lexsorted()
         False
         """
-        return self.lexsort_depth == self.nlevels
+        return self._lexsort_depth == self.nlevels
 
-    @cache_readonly
+    @property
     def lexsort_depth(self):
-        if self.sortorder is not None:
-            return self.sortorder
-
-        return self._lexsort_depth()
+        warnings.warn(
+            "MultiIndex.lexsort_depth is deprecated as a public function, "
+            "users should use MultiIndex.is_monotonic_increasing instead.",
+            FutureWarning,
+            stacklevel=2,
+        )
+        return self._lexsort_depth
 
+    @cache_readonly
     def _lexsort_depth(self) -> int:
         """
         Compute and return the lexsort_depth, the number of levels of the
@@ -1858,11 +1871,9 @@ def _lexsort_depth(self) -> int:
         -------
         int
         """
-        int64_codes = [ensure_int64(level_codes) for level_codes in self.codes]
-        for k in range(self.nlevels, 0, -1):
-            if libalgos.is_lexsorted(int64_codes[:k]):
-                return k
-        return 0
+        if self.sortorder is not None:
+            return self.sortorder
+        return _lexsort_depth(self.codes, self.nlevels)
 
     def _sort_levels_monotonic(self):
         """
@@ -1898,7 +1909,7 @@ def _sort_levels_monotonic(self):
                     ('b', 'bb')],
                    )
         """
-        if self.is_lexsorted() and self.is_monotonic:
+        if self._is_lexsorted() and self.is_monotonic:
             return self
 
         new_levels = []
@@ -2184,7 +2195,7 @@ def drop(self, codes, level=None, errors="raise"):
                     step = loc.step if loc.step is not None else 1
                     inds.extend(range(loc.start, loc.stop, step))
                 elif com.is_bool_indexer(loc):
-                    if self.lexsort_depth == 0:
+                    if self._lexsort_depth == 0:
                         warnings.warn(
                             "dropping on a non-lexsorted multi-index "
                             "without a level parameter may impact performance.",
@@ -2755,10 +2766,10 @@ def slice_locs(self, start=None, end=None, step=None, kind=None):
         return super().slice_locs(start, end, step, kind=kind)
 
     def _partial_tup_index(self, tup, side="left"):
-        if len(tup) > self.lexsort_depth:
+        if len(tup) > self._lexsort_depth:
             raise UnsortedIndexError(
                 f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth "
-                f"({self.lexsort_depth})"
+                f"({self._lexsort_depth})"
             )
 
         n = len(tup)
@@ -2897,7 +2908,7 @@ def _maybe_to_slice(loc):
         # break the key into 2 parts based on the lexsort_depth of the index;
         # the first part returns a continuous slice of the index; the 2nd part
         # needs linear search within the slice
-        i = self.lexsort_depth
+        i = self._lexsort_depth
         lead_key, follow_key = key[:i], key[i:]
         start, stop = (
             self.slice_locs(lead_key, lead_key) if lead_key else (0, len(self))
@@ -3150,7 +3161,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
                 stop = getattr(stop, "stop", stop)
                 return convert_indexer(start, stop, step)
 
-            elif level > 0 or self.lexsort_depth == 0 or step is not None:
+            elif level > 0 or self._lexsort_depth == 0 or step is not None:
                 # need to have like semantics here to right
                 # searching as when we are using a slice
                 # so include the stop+1 (so we include stop)
@@ -3165,7 +3176,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
 
             idx = self._get_loc_single_level_index(level_index, key)
 
-            if level > 0 or self.lexsort_depth == 0:
+            if level > 0 or self._lexsort_depth == 0:
                 # Desired level is not sorted
                 locs = np.array(level_codes == idx, dtype=bool, copy=False)
                 if not locs.any():
@@ -3222,10 +3233,10 @@ def get_locs(self, seq):
 
         # must be lexsorted to at least as many levels
         true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]
-        if true_slices and true_slices[-1] >= self.lexsort_depth:
+        if true_slices and true_slices[-1] >= self._lexsort_depth:
             raise UnsortedIndexError(
                 "MultiIndex slicing requires the index to be lexsorted: slicing "
-                f"on levels {true_slices}, lexsort depth {self.lexsort_depth}"
+                f"on levels {true_slices}, lexsort depth {self._lexsort_depth}"
             )
         # indexer
         # this is the list of all values that we want to select
@@ -3347,7 +3358,7 @@ def _reorder_indexer(
         """
         # If the index is lexsorted and the list_like label in seq are sorted
         # then we do not need to sort
-        if self.is_lexsorted():
+        if self._is_lexsorted():
             need_sort = False
             for i, k in enumerate(seq):
                 if is_list_like(k):
@@ -3768,6 +3779,15 @@ def isin(self, values, level=None):
     __inv__ = make_invalid_op("__inv__")
 
 
+def _lexsort_depth(codes: List[np.ndarray], nlevels: int) -> int:
+    """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted."""
+    int64_codes = [ensure_int64(level_codes) for level_codes in codes]
+    for k in range(nlevels, 0, -1):
+        if libalgos.is_lexsorted(int64_codes[:k]):
+            return k
+    return 0
+
+
 def sparsify_labels(label_list, start: int = 0, sentinel=""):
     pivoted = list(zip(*label_list))
     k = len(label_list)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -617,7 +617,7 @@ def _convert_level_number(level_num, columns):
             roll_columns = roll_columns.swaplevel(lev1, lev2)
         this.columns = roll_columns
 
-    if not this.columns.is_lexsorted():
+    if not this.columns._is_lexsorted():
         # Workaround the edge case where 0 is one of the column names,
         # which interferes with trying to sort based on the first
         # level

diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py
@@ -162,7 +162,7 @@ def test_drop_multiindex_not_lexsorted(self):
             [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
         )
         lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
-        assert lexsorted_df.columns.is_lexsorted()
+        assert lexsorted_df.columns._is_lexsorted()
 
         # define the non-lexsorted version
         not_lexsorted_df = DataFrame(
@@ -172,7 +172,7 @@ def test_drop_multiindex_not_lexsorted(self):
             index="a", columns=["b", "c"], values="d"
         )
         not_lexsorted_df = not_lexsorted_df.reset_index()
-        assert not not_lexsorted_df.columns.is_lexsorted()
+        assert not not_lexsorted_df.columns._is_lexsorted()
 
         # compare the results
         tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
@@ -24,7 +24,7 @@ def test_sort_index_and_reconstruction_doc_example(self):
                 levels=[["a", "b"], ["bb", "aa"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
             ),
         )
-        assert df.index.is_lexsorted()
+        assert df.index._is_lexsorted()
         assert not df.index.is_monotonic
 
         # sort it
@@ -35,15 +35,13 @@ def test_sort_index_and_reconstruction_doc_example(self):
             ),
         )
         result = df.sort_index()
-        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
         tm.assert_frame_equal(result, expected)
 
         # reconstruct
         result = df.sort_index().copy()
         result.index = result.index._sort_levels_monotonic()
-        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
         tm.assert_frame_equal(result, expected)
@@ -524,14 +522,13 @@ def test_sort_index_and_reconstruction(self):
                 [(0.5, "a"), (0.5, "b"), (0.8, "a"), (0.8, "b")]
             ),
         )
-        assert expected.index.is_lexsorted()
+        assert expected.index._is_lexsorted()
 
         result = DataFrame(
             [[1, 1], [2, 2], [1, 1], [2, 2]],
             index=MultiIndex.from_product([[0.5, 0.8], list("ab")]),
         )
         result = result.sort_index()
-        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
         tm.assert_frame_equal(result, expected)
@@ -543,14 +540,13 @@ def test_sort_index_and_reconstruction(self):
             ),
         )
         result = result.sort_index()
-        assert result.index.is_lexsorted()
+        assert result.index._is_lexsorted()
 
         tm.assert_frame_equal(result, expected)
 
         concatted = pd.concat([df, df], keys=[0.8, 0.5])
         result = concatted.sort_index()
 
-        assert result.index.is_lexsorted()
         assert result.index.is_monotonic
 
         tm.assert_frame_equal(result, expected)
@@ -567,13 +563,10 @@ def test_sort_index_and_reconstruction(self):
         df.columns = df.columns.set_levels(
             pd.to_datetime(df.columns.levels[1]), level=1
         )
-        assert not df.columns.is_lexsorted()
         assert not df.columns.is_monotonic
         result = df.sort_index(axis=1)
-        assert result.columns.is_lexsorted()
         assert result.columns.is_monotonic
         result = df.sort_index(axis=1, level=1)
-        assert result.columns.is_lexsorted()
         assert result.columns.is_monotonic
 
     # TODO: better name, de-duplicate with test_sort_index_level above
@@ -614,7 +607,7 @@ def test_sort_index_level_large_cardinality(self):
 
         # it works!
         result = df.sort_index(level=0)
-        assert result.index.lexsort_depth == 3
+        assert result.index._lexsort_depth == 3
 
         # GH#2684 (int32)
         index = MultiIndex.from_arrays([np.arange(4000)] * 3)
@@ -623,7 +616,7 @@ def test_sort_index_level_large_cardinality(self):
         # it works!
         result = df.sort_index(level=0)
         assert (result.dtypes.values == df.dtypes.values).all()
-        assert result.index.lexsort_depth == 3
+        assert result.index._lexsort_depth == 3
 
     def test_sort_index_level_by_name(self):
         mi = MultiIndex(