Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: unstack with sort=False fails when used with the level parameter… #56357

Merged
merged 11 commits into from
May 21, 2024
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ Groupby/resample/rolling
Reshaping
^^^^^^^^^
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
-
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)

Sparse
^^^^^^
Expand Down
20 changes: 13 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ def _indexer_and_to_sort(
v = self.level

codes = list(self.index.codes)
if not self.sort:
# Create new codes considering that labels are already sorted
codes = [factorize(code)[0] for code in codes]
levs = list(self.index.levels)
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
Expand All @@ -186,12 +189,9 @@ def sorted_labels(self) -> list[np.ndarray]:
return to_sort

def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
if self.sort:
indexer, _ = self._indexer_and_to_sort

sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values
return values
indexer, _ = self._indexer_and_to_sort
sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values

def _make_selectors(self) -> None:
new_levels = self.new_index_levels
Expand Down Expand Up @@ -394,7 +394,13 @@ def _repeater(self) -> np.ndarray:
@cache_readonly
def new_index(self) -> MultiIndex | Index:
# Does not depend on values or value_columns
result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
if self.sort:
labels = self.sorted_labels[:-1]
else:
v = self.level
codes = list(self.index.codes)
labels = codes[:v] + codes[v + 1 :]
result_codes = [lab.take(self.compressor) for lab in labels]

# construct the new index
if len(self.new_index_levels) == 1:
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1321,6 +1321,21 @@ def test_unstack_sort_false(frame_or_series, dtype):
[("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
)
obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)

result = obj.unstack(level=0, sort=False)

if frame_or_series is DataFrame:
expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")])
else:
expected_columns = ["two", "one"]
expected = DataFrame(
[[1.0, 3.0], [2.0, 4.0]],
index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]),
columns=expected_columns,
dtype=dtype,
)
tm.assert_frame_equal(result, expected)

result = obj.unstack(level=-1, sort=False)

if frame_or_series is DataFrame:
Expand Down
13 changes: 0 additions & 13 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2703,16 +2703,3 @@ def test_pivot_table_with_margins_and_numeric_column_names(self):
index=Index(["a", "b", "All"], name=0),
)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("m", [1, 10])
def test_unstack_shares_memory(self, m):
Copy link
Member

@rhshadrach rhshadrach May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was added in #57487. To fix this bug, we now need to unconditionally do a take in _make_sorted_values. I don't think the fact that it shared memory in the m=1 case was important - I think that was the only reason this test was added.

cc @phofl

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to keep the test, it still ensures correct behavior. You can remove the shares memory assertion, but everything else should still pass. A failure would imply a legit bug

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing - done. Renamed since shares_memory was no longer accurate.

# GH#56633
levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
values = np.arange(m * m * 100).reshape(m * m, 100)
df = DataFrame(values, index, np.arange(100))
df_orig = df.copy()
result = df.unstack(sort=False)
assert np.shares_memory(df._values, result._values) is (m == 1)
result.iloc[0, 0] = -1
tm.assert_frame_equal(df, df_orig)