ENH: Add new implementation of DataFrame.stack (#53921)

* DEPR: Add new implementation of DataFrame.stack and deprecate old * Merge cleanup * Revert filterwarnings in conf.py * Merge fixup * Rename inner function * v3->future_stack; other refinements * Fixup docstring * Docstring fixup
pandas-dev · Aug 2, 2023 · 263828c · 263828c
1 parent 46386f0
commit 263828c
Show file tree

Hide file tree

Showing 28 changed files with 662 additions and 275 deletions.
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -438,7 +438,7 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent:
    )
 
    pd.melt(cheese, id_vars=["first", "last"])
-   cheese.set_index(["first", "last"]).stack()  # alternative way
+   cheese.set_index(["first", "last"]).stack(future_stack=True)  # alternative way
 
 For more details and examples see :ref:`the reshaping documentation
 <reshaping.melt>`.

diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
@@ -579,7 +579,7 @@ columns:
 
 .. ipython:: python
 
-   stacked = df2.stack()
+   stacked = df2.stack(future_stack=True)
    stacked
 
 With a "stacked" DataFrame or Series (having a :class:`MultiIndex` as the

diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst
@@ -311,7 +311,7 @@ The :ref:`multindexing <advanced.hierarchical>` docs.
    df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns])
    df
    # Now stack & Reset
-   df = df.stack(0).reset_index(1)
+   df = df.stack(0, future_stack=True).reset_index(1)
    df
    # And fix the labels (Notice the label 'level_1' got added automatically)
    df.columns = ["Sample", "All_X", "All_Y"]
@@ -688,7 +688,7 @@ The :ref:`Pivot <reshaping.pivot>` docs.
        aggfunc="sum",
        margins=True,
    )
-   table.stack("City")
+   table.stack("City", future_stack=True)
 
 `Frequency table like plyr in R
 <https://stackoverflow.com/questions/15589354/frequency-tables-in-pandas-like-plyr-in-r>`__

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -1713,4 +1713,4 @@ column index name will be used as the name of the inserted column:
 
    result
 
-   result.stack()
+   result.stack(future_stack=True)
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
@@ -127,7 +127,7 @@ stacked level becomes the new lowest level in a :class:`MultiIndex` on the colum
 
 .. ipython:: python
 
-   stacked = df2.stack()
+   stacked = df2.stack(future_stack=True)
    stacked
 
 With a "stacked" :class:`DataFrame` or :class:`Series` (having a :class:`MultiIndex` as the
@@ -163,7 +163,7 @@ will result in a **sorted** copy of the original :class:`DataFrame` or :class:`S
    index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]])
    df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"])
    df
-   all(df.unstack().stack() == df.sort_index())
+   all(df.unstack().stack(future_stack=True) == df.sort_index())
 
 The above code will raise a ``TypeError`` if the call to :meth:`~DataFrame.sort_index` is
 removed.
@@ -191,16 +191,16 @@ processed individually.
     df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
     df
 
-    df.stack(level=["animal", "hair_length"])
+    df.stack(level=["animal", "hair_length"], future_stack=True)
 
 The list of levels can contain either level names or level numbers (but
 not a mixture of the two).
 
 .. ipython:: python
 
-    # df.stack(level=['animal', 'hair_length'])
+    # df.stack(level=['animal', 'hair_length'], future_stack=True)
     # from above is equivalent to:
-    df.stack(level=[1, 2])
+    df.stack(level=[1, 2], future_stack=True)
 
 Missing data
 ~~~~~~~~~~~~
@@ -233,8 +233,8 @@ which level in the columns to stack:
 
 .. ipython:: python
 
-   df2.stack("exp")
-   df2.stack("animal")
+   df2.stack("exp", future_stack=True)
+   df2.stack("animal", future_stack=True)
 
 Unstacking can result in missing values if subgroups do not have the same
 set of labels.  By default, missing values will be replaced with the default
@@ -345,12 +345,12 @@ some very expressive and fast data manipulations.
 .. ipython:: python
 
    df
-   df.stack().mean(1).unstack()
+   df.stack(future_stack=True).mean(1).unstack()
 
    # same result, another way
    df.T.groupby(level=1).mean()
 
-   df.stack().groupby(level=1).mean()
+   df.stack(future_stack=True).groupby(level=1).mean()
 
    df.mean().unstack(0)
 
@@ -460,7 +460,7 @@ as having a multi-level index:
 
 .. ipython:: python
 
-    table.stack()
+    table.stack(future_stack=True)
 
 .. _reshaping.crosstabulations:
 

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -78,7 +78,7 @@ Copy-on-Write improvements
   - DataFrame.fillna / Series.fillna
   - DataFrame.replace / Series.replace
 
-.. _whatsnew_210.enhancements.enhancement2:
+.. _whatsnew_210.enhancements.map_na_action:
 
 ``map(func, na_action="ignore")`` now works for all array types
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -128,6 +128,45 @@ Also, note that :meth:`Categorical.map` implicitly has had its ``na_action`` set
 This has been deprecated and will :meth:`Categorical.map` in the future change the default
 to ``na_action=None``, like for all the other array types.
 
+.. _whatsnew_210.enhancements.new_stack:
+
+New implementation of :meth:`DataFrame.stack`
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+pandas has reimplemented :meth:`DataFrame.stack`. To use the new implementation, pass the argument ``future_stack=True``. This will become the only option in pandas 3.0.
+
+The previous implementation had two main behavioral downsides.
+
+ 1. The previous implementation would unnecessarily introduce NA values into the result. The user could have NA values automatically removed by passing ``dropna=True`` (the default), but doing this could also remove NA values from the result that existed in the input. See the examples below.
+ 2. The previous implementation with ``sort=True`` (the default) would sometimes sort part of the resulting index, and sometimes not. If the input's columns are *not* a :class:`MultiIndex`, then the resulting index would never be sorted. If the columns are a :class:`MultiIndex`, then in most cases the level(s) in the resulting index that come from stacking the column level(s) would be sorted. In rare cases such level(s) would be sorted in a non-standard order, depending on how the columns were created.
+
+The new implementation (``future_stack=True``) will no longer unnecessarily introduce NA values when stacking multiple levels and will never sort. As such, the arguments ``dropna`` and ``sort`` are not utilized and must remain unspecified when using ``future_stack=True``. These arguments will be removed in the next major release.
+
+.. ipython:: python
+
+    columns = pd.MultiIndex.from_tuples([("B", "d"), ("A", "c")])
+    df = pd.DataFrame([[0, 2], [1, 3]], index=["z", "y"], columns=columns)
+    df
+
+In the previous version (``future_stack=False``), the default of ``dropna=True`` would remove unnecessarily introduced NA values but still coerce the dtype to ``float64`` in the process. In the new version, no NAs are introduced and so there is no coercion of the dtype.
+
+.. ipython:: python
+    :okwarning:
+
+    df.stack([0, 1], future_stack=False, dropna=True)
+    df.stack([0, 1], future_stack=True)
+
+If the input contains NA values, the previous version would drop those as well with ``dropna=True`` or introduce new NA values with ``dropna=False``. The new version persists all values from the input.
+
+.. ipython:: python
+    :okwarning:
+
+    df = pd.DataFrame([[0, 2], [np.nan, np.nan]], columns=columns)
+    df
+    df.stack([0, 1], future_stack=False, dropna=True)
+    df.stack([0, 1], future_stack=False, dropna=False)
+    df.stack([0, 1], future_stack=True)
+
 .. _whatsnew_210.enhancements.other:
 
 Other enhancements

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9166,7 +9166,13 @@ def pivot_table(
             sort=sort,
         )
 
-    def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
+    def stack(
+        self,
+        level: IndexLabel = -1,
+        dropna: bool | lib.NoDefault = lib.no_default,
+        sort: bool | lib.NoDefault = lib.no_default,
+        future_stack: bool = False,
+    ):
         """
         Stack the prescribed level(s) from columns to index.
 
@@ -9194,6 +9200,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
             section.
         sort : bool, default True
             Whether to sort the levels of the resulting MultiIndex.
+        future_stack : bool, default False
+            Whether to use the new implementation that will replace the current
+            implementation in pandas 3.0. When True, dropna and sort have no impact
+            on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release
+            notes <whatsnew_210.enhancements.new_stack>` for more details.
 
         Returns
         -------
@@ -9233,7 +9244,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
              weight height
         cat       0      1
         dog       2      3
-        >>> df_single_level_cols.stack()
+        >>> df_single_level_cols.stack(future_stack=True)
         cat  weight    0
              height    1
         dog  weight    2
@@ -9255,7 +9266,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
                  kg    pounds
         cat       1        2
         dog       2        4
-        >>> df_multi_level_cols1.stack()
+        >>> df_multi_level_cols1.stack(future_stack=True)
                     weight
         cat kg           1
             pounds       2
@@ -9280,7 +9291,7 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
                 kg      m
         cat    1.0    2.0
         dog    3.0    4.0
-        >>> df_multi_level_cols2.stack()
+        >>> df_multi_level_cols2.stack(future_stack=True)
                 weight  height
         cat kg     1.0     NaN
             m      NaN     2.0
@@ -9291,17 +9302,17 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
 
         The first parameter controls which level or levels are stacked:
 
-        >>> df_multi_level_cols2.stack(0)
+        >>> df_multi_level_cols2.stack(0, future_stack=True)
                      kg    m
-        cat height  NaN  2.0
-            weight  1.0  NaN
-        dog height  NaN  4.0
-            weight  3.0  NaN
-        >>> df_multi_level_cols2.stack([0, 1])
-        cat  height  m     2.0
-             weight  kg    1.0
-        dog  height  m     4.0
-             weight  kg    3.0
+        cat weight  1.0  NaN
+            height  NaN  2.0
+        dog weight  3.0  NaN
+            height  NaN  4.0
+        >>> df_multi_level_cols2.stack([0, 1], future_stack=True)
+        cat  weight  kg    1.0
+             height  m     2.0
+        dog  weight  kg    3.0
+             height  m     4.0
         dtype: float64
 
         **Dropping missing values**
@@ -9331,15 +9342,52 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
         dog kg     2.0     NaN
             m      NaN     3.0
         """
-        from pandas.core.reshape.reshape import (
-            stack,
-            stack_multiple,
-        )
+        if not future_stack:
+            from pandas.core.reshape.reshape import (
+                stack,
+                stack_multiple,
+            )
+
+            if dropna is lib.no_default:
+                dropna = True
+            if sort is lib.no_default:
+                sort = True
 
-        if isinstance(level, (tuple, list)):
-            result = stack_multiple(self, level, dropna=dropna, sort=sort)
+            if isinstance(level, (tuple, list)):
+                result = stack_multiple(self, level, dropna=dropna, sort=sort)
+            else:
+                result = stack(self, level, dropna=dropna, sort=sort)
         else:
-            result = stack(self, level, dropna=dropna, sort=sort)
+            from pandas.core.reshape.reshape import stack_v3
+
+            if dropna is not lib.no_default:
+                raise ValueError(
+                    "dropna must be unspecified with future_stack=True as the new "
+                    "implementation does not introduce rows of NA values. This "
+                    "argument will be removed in a future version of pandas."
+                )
+
+            if sort is not lib.no_default:
+                raise ValueError(
+                    "Cannot specify sort with future_stack=True, this argument will be "
+                    "removed in a future version of pandas. Sort the result using "
+                    ".sort_index instead."
+                )
+
+            if (
+                isinstance(level, (tuple, list))
+                and not all(lev in self.columns.names for lev in level)
+                and not all(isinstance(lev, int) for lev in level)
+            ):
+                raise ValueError(
+                    "level should contain all level names or all level "
+                    "numbers, not a mixture of the two."
+                )
+
+            if not isinstance(level, (tuple, list)):
+                level = [level]
+            level = [self.columns._get_level_number(lev) for lev in level]
+            result = stack_v3(self, level)
 
         return result.__finalize__(self, method="stack")
 

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -416,7 +416,7 @@ def _wrap_applied_output(
             res_df = self._reindex_output(res_df)
             # if self.observed is False,
             # keep all-NaN rows created while re-indexing
-            res_ser = res_df.stack(dropna=self.observed)
+            res_ser = res_df.stack(future_stack=True)
             res_ser.name = self.obj.name
             return res_ser
         elif isinstance(values[0], (Series, DataFrame)):

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2440,6 +2440,10 @@ def reorder_levels(self, order) -> MultiIndex:
                    names=['y', 'x'])
         """
         order = [self._get_level_number(i) for i in order]
+        result = self._reorder_ilevels(order)
+        return result
+
+    def _reorder_ilevels(self, order) -> MultiIndex:
         if len(order) != self.nlevels:
             raise AssertionError(
                 f"Length of order must be same as number of levels ({self.nlevels}), "

diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -1497,7 +1497,7 @@ def size(self):
         # If the result is a non-empty DataFrame we stack to get a Series
         # GH 46826
         if isinstance(result, ABCDataFrame) and not result.empty:
-            result = result.stack()
+            result = result.stack(future_stack=True)
 
         if not len(self.ax):
             from pandas import Series

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -418,7 +418,7 @@ def _all_key(key):
 
     if len(cols) > 0:
         row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
-        row_margin = row_margin.stack()
+        row_margin = row_margin.stack(future_stack=True)
 
         # slight hack
         new_order = [len(cols)] + list(range(len(cols)))