Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ Other enhancements
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`)
- :meth:`DataFrame.unstack` and :meth:`Series.unstack` now support a ``no_fill`` parameter that raises a ``ValueError`` if any missing values would need to be filled during the unstack operation, allowing users to enforce data integrity when a complete 1:1 mapping between stacked and unstacked representations is expected (:issue:`62704`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
Expand Down
20 changes: 18 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10320,7 +10320,11 @@ def explode(
return result.__finalize__(self, method="explode")

def unstack(
self, level: IndexLabel = -1, fill_value=None, sort: bool = True
self,
level: IndexLabel = -1,
fill_value=None,
sort: bool = True,
no_fill: bool = False,
) -> DataFrame | Series:
"""
Pivot a level of the (necessarily hierarchical) index labels.
Expand All @@ -10339,13 +10343,25 @@ def unstack(
Replace NaN with this value if the unstack produces missing values.
sort : bool, default True
Sort the level(s) in the resulting MultiIndex columns.
no_fill : bool, default False
If True, raise a ValueError if any missing values would need to be filled.
This is useful to ensure data integrity when you expect a complete
1:1 mapping between stacked and unstacked representations.

.. versionadded:: 3.0.0

Returns
-------
Series or DataFrame
If index is a MultiIndex: DataFrame with pivoted index labels as new
inner-most level column labels, else Series.

Raises
------
ValueError
If `no_fill` is True and the unstacking operation would require filling
missing values.

See Also
--------
DataFrame.pivot : Pivot a table based on column values.
Expand Down Expand Up @@ -10389,7 +10405,7 @@ def unstack(
"""
from pandas.core.reshape.reshape import unstack

result = unstack(self, level, fill_value, sort)
result = unstack(self, level, fill_value, sort, no_fill)

return result.__finalize__(self, method="unstack")

Expand Down
89 changes: 73 additions & 16 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,16 @@ class _Unstacker:
"""

def __init__(
self, index: MultiIndex, level: Level, constructor, sort: bool = True
self,
index: MultiIndex,
level: Level,
constructor,
sort: bool = True,
no_fill: bool = False,
) -> None:
self.constructor = constructor
self.sort = sort
self.no_fill = no_fill

self.index = index.remove_unused_levels()

Expand Down Expand Up @@ -290,6 +296,29 @@ def get_new_values(self, values, fill_value=None):
mask = self.mask
mask_all = self.mask_all

if self.no_fill and not mask_all:
missing_positions = np.where(~mask)[0]
if len(missing_positions) > 0:
first_missing = missing_positions[0]
row_idx = first_missing // width
col_idx = first_missing % width

index_label = (
self.new_index[row_idx]
if row_idx < len(self.new_index)
else row_idx
)
col_label = (
self.removed_level[col_idx]
if col_idx < len(self.removed_level)
else col_idx
)

raise ValueError(
f"Cannot unstack with no_fill=True because filling is required. "
f"Missing value at index {index_label}, column {col_label}."
)

# we can simply reshape if we don't have a mask
if mask_all and len(values):
# TODO: Under what circumstances can we rely on sorted_values
Expand Down Expand Up @@ -457,7 +486,11 @@ def new_index(self) -> MultiIndex | Index:


def _unstack_multiple(
data: Series | DataFrame, clocs, fill_value=None, sort: bool = True
data: Series | DataFrame,
clocs,
fill_value=None,
sort: bool = True,
no_fill: bool = False,
):
if len(clocs) == 0:
return data
Expand Down Expand Up @@ -503,7 +536,9 @@ def _unstack_multiple(
dummy = data.copy(deep=False)
dummy.index = dummy_index

unstacked = dummy.unstack("__placeholder__", fill_value=fill_value, sort=sort)
unstacked = dummy.unstack(
"__placeholder__", fill_value=fill_value, sort=sort, no_fill=no_fill
)
new_levels = clevels
new_names = cnames
new_codes = recons_codes
Expand All @@ -515,7 +550,7 @@ def _unstack_multiple(
# error: Incompatible types in assignment (expression has type
# "DataFrame | Series", variable has type "DataFrame")
result = result.unstack( # type: ignore[assignment]
val, fill_value=fill_value, sort=sort
val, fill_value=fill_value, sort=sort, no_fill=no_fill
)
clocs = [v if v < val else v - 1 for v in clocs]

Expand All @@ -528,7 +563,7 @@ def _unstack_multiple(
# error: Incompatible types in assignment (expression has type "DataFrame |
# Series", variable has type "DataFrame")
unstacked = dummy_df.unstack( # type: ignore[assignment]
"__placeholder__", fill_value=fill_value, sort=sort
"__placeholder__", fill_value=fill_value, sort=sort, no_fill=no_fill
)
if isinstance(unstacked, Series):
unstcols = unstacked.index
Expand All @@ -554,23 +589,35 @@ def _unstack_multiple(


@overload
def unstack(obj: Series, level, fill_value=..., sort: bool = ...) -> DataFrame: ...
def unstack(
obj: Series, level, fill_value=..., sort: bool = ..., no_fill: bool = ...
) -> DataFrame: ...


@overload
def unstack(
obj: Series | DataFrame, level, fill_value=..., sort: bool = ...
obj: Series | DataFrame,
level,
fill_value=...,
sort: bool = ...,
no_fill: bool = ...,
) -> Series | DataFrame: ...


def unstack(
obj: Series | DataFrame, level, fill_value=None, sort: bool = True
obj: Series | DataFrame,
level,
fill_value=None,
sort: bool = True,
no_fill: bool = False,
) -> Series | DataFrame:
if isinstance(level, (tuple, list)):
if len(level) != 1:
# _unstack_multiple only handles MultiIndexes,
# and isn't needed for a single level
return _unstack_multiple(obj, level, fill_value=fill_value, sort=sort)
return _unstack_multiple(
obj, level, fill_value=fill_value, sort=sort, no_fill=no_fill
)
else:
level = level[0]

Expand All @@ -580,7 +627,9 @@ def unstack(

if isinstance(obj, DataFrame):
if isinstance(obj.index, MultiIndex):
return _unstack_frame(obj, level, fill_value=fill_value, sort=sort)
return _unstack_frame(
obj, level, fill_value=fill_value, sort=sort, no_fill=no_fill
)
else:
return obj.T.stack()
elif not isinstance(obj.index, MultiIndex):
Expand All @@ -592,19 +641,25 @@ def unstack(
)
else:
if is_1d_only_ea_dtype(obj.dtype):
return _unstack_extension_series(obj, level, fill_value, sort=sort)
return _unstack_extension_series(
obj, level, fill_value, sort=sort, no_fill=no_fill
)
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor_expanddim, sort=sort
obj.index,
level=level,
constructor=obj._constructor_expanddim,
sort=sort,
no_fill=no_fill,
)
return unstacker.get_result(obj, value_columns=None, fill_value=fill_value)


def _unstack_frame(
obj: DataFrame, level, fill_value=None, sort: bool = True
obj: DataFrame, level, fill_value=None, sort: bool = True, no_fill: bool = False
) -> DataFrame:
assert isinstance(obj.index, MultiIndex) # checked by caller
unstacker = _Unstacker(
obj.index, level=level, constructor=obj._constructor, sort=sort
obj.index, level=level, constructor=obj._constructor, sort=sort, no_fill=no_fill
)

if not obj._can_fast_transpose:
Expand All @@ -617,7 +672,7 @@ def _unstack_frame(


def _unstack_extension_series(
series: Series, level, fill_value, sort: bool
series: Series, level, fill_value, sort: bool, no_fill: bool = False
) -> DataFrame:
"""
Unstack an ExtensionArray-backed Series.
Expand All @@ -636,6 +691,8 @@ def _unstack_extension_series(
``series.values.take``.
sort : bool
Whether to sort the resulting MultiIndex levels
no_fill : bool, default False
Whether to raise an error if any missing values are encountered

Returns
-------
Expand All @@ -645,7 +702,7 @@ def _unstack_extension_series(
"""
# Defer to the logic in ExtensionBlock._unstack
df = series.to_frame()
result = df.unstack(level=level, fill_value=fill_value, sort=sort)
result = df.unstack(level=level, fill_value=fill_value, sort=sort, no_fill=no_fill)

# equiv: result.droplevel(level=0, axis=1)
# but this avoids an extra copy
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4282,6 +4282,7 @@ def unstack(
level: IndexLabel = -1,
fill_value: Hashable | None = None,
sort: bool = True,
no_fill: bool = False,
) -> DataFrame:
"""
Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
Expand All @@ -4294,6 +4295,10 @@ def unstack(
Value to use when replacing NaN values.
sort : bool, default True
Sort the level(s) in the resulting MultiIndex columns.
no_fill : bool, default False
If True, raise a ValueError if any missing values would need to be filled.
This is useful to ensure data integrity when you expect a complete
1:1 mapping between stacked and unstacked representations.

Returns
-------
Expand Down Expand Up @@ -4333,7 +4338,7 @@ def unstack(
"""
from pandas.core.reshape.reshape import unstack

return unstack(self, level, fill_value, sort)
return unstack(self, level, fill_value, sort, no_fill)

# ----------------------------------------------------------------------
# function application
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -2779,3 +2779,56 @@ def test_stack_preserves_na(dtype, na_value, test_multiindex):
)
expected = Series(1, index=expected_index)
tm.assert_series_equal(result, expected)


def test_unstack_no_fill_complete_data():
    # The index is a full 2x2 product, so unstacking the last level leaves
    # no holes and no_fill=True is expected to return the plain result.
    full_index = MultiIndex.from_product([["A", "B"], ["x", "y"]])
    frame = DataFrame({"value": [1, 2, 3, 4]}, index=full_index)

    expected_columns = MultiIndex.from_tuples([("value", "x"), ("value", "y")])
    expected = DataFrame(
        [[1, 2], [3, 4]],
        index=["A", "B"],
        columns=expected_columns,
    )

    result = frame.unstack(level=-1, no_fill=True)
    tm.assert_frame_equal(result, expected)


def test_unstack_no_fill_incomplete_data():
    # The ("B", "y") combination is absent from the index, so unstacking the
    # last level would need a filled cell; no_fill=True must raise instead.
    sparse_index = MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")])
    frame = DataFrame({"value": [1, 2, 3]}, index=sparse_index)

    with pytest.raises(
        ValueError,
        match="Cannot unstack with no_fill=True because filling is required",
    ):
        frame.unstack(level=-1, no_fill=True)


def test_unstack_no_fill_default_behavior():
    # With no_fill=False the absent ("B", "y") cell is expected to come back
    # as NaN, which is why the expected frame holds float values.
    sparse_index = MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")])
    frame = DataFrame({"value": [1, 2, 3]}, index=sparse_index)

    expected_columns = MultiIndex.from_tuples([("value", "x"), ("value", "y")])
    expected = DataFrame(
        [[1.0, 2.0], [3.0, np.nan]],
        index=["A", "B"],
        columns=expected_columns,
    )

    tm.assert_frame_equal(frame.unstack(level=-1, no_fill=False), expected)


def test_unstack_no_fill_with_fill_value():
    # Passing fill_value=0 alongside no_fill=True is still expected to raise:
    # the ("B", "y") cell is missing and would have to be filled.
    sparse_index = MultiIndex.from_tuples([("A", "x"), ("A", "y"), ("B", "x")])
    frame = DataFrame({"value": [1, 2, 3]}, index=sparse_index)

    with pytest.raises(
        ValueError,
        match="Cannot unstack with no_fill=True because filling is required",
    ):
        frame.unstack(level=-1, fill_value=0, no_fill=True)
35 changes: 35 additions & 0 deletions pandas/tests/series/methods/test_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,38 @@ def test_unstack_mixed_level_names():
index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]),
)
tm.assert_frame_equal(result, expected)


def test_unstack_no_fill_complete_data():
    # Every (outer, inner) pair of the 2x2 product is present, so
    # no_fill=True is expected to return the ordinary unstacked frame.
    full_index = MultiIndex.from_product([["one", "two"], ["a", "b"]])
    ser = Series(np.arange(1.0, 5.0), index=full_index)

    expected = DataFrame(
        [[1.0, 2.0], [3.0, 4.0]],
        index=["one", "two"],
        columns=["a", "b"],
    )

    result = ser.unstack(level=-1, no_fill=True)
    tm.assert_frame_equal(result, expected)


def test_unstack_no_fill_incomplete_data():
    # ("two", "b") is absent from the index, so the unstacked frame would
    # need a filled cell; no_fill=True must raise a ValueError instead.
    sparse_index = MultiIndex.from_tuples(
        [("one", "a"), ("one", "b"), ("two", "a")]
    )
    ser = Series([1, 2, 3], index=sparse_index)

    with pytest.raises(
        ValueError,
        match="Cannot unstack with no_fill=True because filling is required",
    ):
        ser.unstack(level=-1, no_fill=True)


def test_unstack_no_fill_default_behavior():
    # With no_fill=False the absent ("two", "b") cell is expected to be NaN,
    # so the integer input comes back as a float frame.
    sparse_index = MultiIndex.from_tuples(
        [("one", "a"), ("one", "b"), ("two", "a")]
    )
    ser = Series([1, 2, 3], index=sparse_index)

    expected = DataFrame(
        [[1.0, 2.0], [3.0, np.nan]],
        index=["one", "two"],
        columns=["a", "b"],
    )

    tm.assert_frame_equal(ser.unstack(level=-1, no_fill=False), expected)
Loading