BUG: unstack with sort=False fails when used with the level parameter (#56357)

* BUG: unstack with sort=False fails when used with the level parameter (#54987)

  Assign new codes to labels when sort=False. This is done so that the data appears to be already sorted, fixing the bug.

* Minor refactor and cleanup
* Cleanup & remove test
* whatsnew
* Revert test removal

---------

Co-authored-by: richard <rhshadrach@gmail.com>
Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
pandas-dev · May 21, 2024 · b991274 · b991274
1 parent bdcb192
commit b991274
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 10 deletions.
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -477,7 +477,7 @@ Groupby/resample/rolling
 Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
--
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
 
 Sparse
 ^^^^^^

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -168,6 +168,9 @@ def _indexer_and_to_sort(
         v = self.level
 
         codes = list(self.index.codes)
+        if not self.sort:
+            # Create new codes considering that labels are already sorted
+            codes = [factorize(code)[0] for code in codes]
         levs = list(self.index.levels)
         to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
         sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
@@ -186,12 +189,9 @@ def sorted_labels(self) -> list[np.ndarray]:
         return to_sort
 
     def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
-        if self.sort:
-            indexer, _ = self._indexer_and_to_sort
-
-            sorted_values = algos.take_nd(values, indexer, axis=0)
-            return sorted_values
-        return values
+        indexer, _ = self._indexer_and_to_sort
+        sorted_values = algos.take_nd(values, indexer, axis=0)
+        return sorted_values
 
     def _make_selectors(self) -> None:
         new_levels = self.new_index_levels
@@ -394,7 +394,13 @@ def _repeater(self) -> np.ndarray:
     @cache_readonly
     def new_index(self) -> MultiIndex | Index:
         # Does not depend on values or value_columns
-        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
+        if self.sort:
+            labels = self.sorted_labels[:-1]
+        else:
+            v = self.level
+            codes = list(self.index.codes)
+            labels = codes[:v] + codes[v + 1 :]
+        result_codes = [lab.take(self.compressor) for lab in labels]
 
         # construct the new index
         if len(self.new_index_levels) == 1:

diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -1321,6 +1321,21 @@ def test_unstack_sort_false(frame_or_series, dtype):
         [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
     )
     obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)
+
+    result = obj.unstack(level=0, sort=False)
+
+    if frame_or_series is DataFrame:
+        expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")])
+    else:
+        expected_columns = ["two", "one"]
+    expected = DataFrame(
+        [[1.0, 3.0], [2.0, 4.0]],
+        index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]),
+        columns=expected_columns,
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
     result = obj.unstack(level=-1, sort=False)
 
     if frame_or_series is DataFrame:

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -2705,14 +2705,13 @@ def test_pivot_table_with_margins_and_numeric_column_names(self):
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("m", [1, 10])
-    def test_unstack_shares_memory(self, m):
+    def test_unstack_copy(self, m):
         # GH#56633
         levels = np.arange(m)
         index = MultiIndex.from_product([levels] * 2)
         values = np.arange(m * m * 100).reshape(m * m, 100)
         df = DataFrame(values, index, np.arange(100))
         df_orig = df.copy()
         result = df.unstack(sort=False)
-        assert np.shares_memory(df._values, result._values) is (m == 1)
         result.iloc[0, 0] = -1
         tm.assert_frame_equal(df, df_orig)