Skip to content

Commit

Permalink
BUG: unstack with sort=False fails when used with the level parameter… (
Browse files Browse the repository at this point in the history
#56357)

* BUG: unstack with sort=False fails when used with the level parameter (#54987)

Assign new codes to labels when sort=False. This is done so that the data appears to be already sorted,
fixing the bug.

* Minor refactor and cleanup

* Cleanup & remove test

* whatsnew

* Revert test removal

---------

Co-authored-by: richard <rhshadrach@gmail.com>
Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
  • Loading branch information
4 people committed May 21, 2024
1 parent bdcb192 commit b991274
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 10 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ Groupby/resample/rolling
Reshaping
^^^^^^^^^
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
-
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)

Sparse
^^^^^^
Expand Down
20 changes: 13 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,9 @@ def _indexer_and_to_sort(
v = self.level

codes = list(self.index.codes)
if not self.sort:
# Create new codes considering that labels are already sorted
codes = [factorize(code)[0] for code in codes]
levs = list(self.index.levels)
to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
Expand All @@ -186,12 +189,9 @@ def sorted_labels(self) -> list[np.ndarray]:
return to_sort

def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
if self.sort:
indexer, _ = self._indexer_and_to_sort

sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values
return values
indexer, _ = self._indexer_and_to_sort
sorted_values = algos.take_nd(values, indexer, axis=0)
return sorted_values

def _make_selectors(self) -> None:
new_levels = self.new_index_levels
Expand Down Expand Up @@ -394,7 +394,13 @@ def _repeater(self) -> np.ndarray:
@cache_readonly
def new_index(self) -> MultiIndex | Index:
# Does not depend on values or value_columns
result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
if self.sort:
labels = self.sorted_labels[:-1]
else:
v = self.level
codes = list(self.index.codes)
labels = codes[:v] + codes[v + 1 :]
result_codes = [lab.take(self.compressor) for lab in labels]

# construct the new index
if len(self.new_index_levels) == 1:
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1321,6 +1321,21 @@ def test_unstack_sort_false(frame_or_series, dtype):
[("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
)
obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)

result = obj.unstack(level=0, sort=False)

if frame_or_series is DataFrame:
expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")])
else:
expected_columns = ["two", "one"]
expected = DataFrame(
[[1.0, 3.0], [2.0, 4.0]],
index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]),
columns=expected_columns,
dtype=dtype,
)
tm.assert_frame_equal(result, expected)

result = obj.unstack(level=-1, sort=False)

if frame_or_series is DataFrame:
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2705,14 +2705,13 @@ def test_pivot_table_with_margins_and_numeric_column_names(self):
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("m", [1, 10])
def test_unstack_shares_memory(self, m):
def test_unstack_copy(self, m):
# GH#56633
levels = np.arange(m)
index = MultiIndex.from_product([levels] * 2)
values = np.arange(m * m * 100).reshape(m * m, 100)
df = DataFrame(values, index, np.arange(100))
df_orig = df.copy()
result = df.unstack(sort=False)
assert np.shares_memory(df._values, result._values) is (m == 1)
result.iloc[0, 0] = -1
tm.assert_frame_equal(df, df_orig)

0 comments on commit b991274

Please sign in to comment.