DEPR: DataFrame.groupby(axis=1) (#51395)

pandas-dev · Mar 4, 2023 · b391397 · b391397
1 parent ff187c0
commit b391397
Show file tree

Hide file tree

Showing 33 changed files with 303 additions and 102 deletions.
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -617,8 +617,8 @@ even if some categories are not present in the data:
     df = pd.DataFrame(
         data=[[1, 2, 3], [4, 5, 6]],
         columns=pd.MultiIndex.from_arrays([["A", "B", "B"], columns]),
-    )
-    df.groupby(axis=1, level=1).sum()
+    ).T
+    df.groupby(level=1).sum()
 
 Groupby will also show "unused" categories:
 

diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -94,15 +94,13 @@ object (more on what the GroupBy object is later), you may do the following:
     )
     speeds
 
-    # default is axis=0
     grouped = speeds.groupby("class")
-    grouped = speeds.groupby("order", axis="columns")
     grouped = speeds.groupby(["class", "order"])
 
 The mapping can be specified many different ways:
 
 * A Python function, to be called on each of the axis labels.
-* A list or NumPy array of the same length as the selected axis.
+* A list or NumPy array of the same length as the index.
 * A dict or ``Series``, providing a ``label -> group name`` mapping.
 * For ``DataFrame`` objects, a string indicating either a column name or
   an index level name to be used to group.
@@ -147,8 +145,8 @@ but the specified columns
    grouped = df2.groupby(level=df2.index.names.difference(["B"]))
    grouped.sum()
 
-These will split the DataFrame on its index (rows). We could also split by the
-columns:
+These will split the DataFrame on its index (rows). To split by columns, first do
+a transpose:
 
 .. ipython::
 
@@ -159,7 +157,7 @@ columns:
        ...:         return 'consonant'
        ...:
 
-    In [5]: grouped = df.groupby(get_letter_type, axis=1)
+    In [5]: grouped = df.T.groupby(get_letter_type)
 
 pandas :class:`~pandas.Index` objects support duplicate values. If a
 non-unique index is used as the group key in a groupby operation, all values
@@ -254,7 +252,7 @@ above example we have:
 .. ipython:: python
 
    df.groupby("A").groups
-   df.groupby(get_letter_type, axis=1).groups
+   df.T.groupby(get_letter_type).groups
 
 Calling the standard Python ``len`` function on the GroupBy object just returns
 the length of the ``groups`` dict, so it is largely just a convenience:
@@ -496,7 +494,7 @@ An obvious one is aggregation via the
    grouped.aggregate(np.sum)
 
 As you can see, the result of the aggregation will have the group names as the
-new index along the grouped axis. In the case of multiple keys, the result is a
+new index. In the case of multiple keys, the result is a
 :ref:`MultiIndex <advanced.hierarchical>` by default, though this can be
 changed by using the ``as_index`` option:
 
@@ -1556,7 +1554,8 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
 
    df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]})
    df
-   df.groupby(df.sum(), axis=1).sum()
+   dft = df.T
+   dft.groupby(dft.sum(axis=1)).sum()
 
 .. _groupby.multicolumn_factorization:
 

diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
@@ -350,7 +350,7 @@ some very expressive and fast data manipulations.
    df.stack().mean(1).unstack()
 
    # same result, another way
-   df.groupby(level=1, axis=1).mean()
+   df.T.groupby(level=1).mean()
 
    df.stack().groupby(level=1).mean()
 

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -93,6 +93,7 @@ Other API changes
 Deprecations
 ~~~~~~~~~~~~
 - Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`)
+- Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in the :class:`Grouper` constructor; use ``frame.T.groupby(...)`` instead (:issue:`51203`)
 - Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`)
 - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`)
 -

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -8247,19 +8247,37 @@ def update(
     def groupby(
         self,
         by=None,
-        axis: Axis = 0,
+        axis: Axis | lib.NoDefault = no_default,
         level: IndexLabel | None = None,
         as_index: bool = True,
         sort: bool = True,
         group_keys: bool = True,
         observed: bool = False,
         dropna: bool = True,
     ) -> DataFrameGroupBy:
+        if axis is not lib.no_default:
+            axis = self._get_axis_number(axis)
+            if axis == 1:
+                warnings.warn(
+                    "DataFrame.groupby with axis=1 is deprecated. Do "
+                    "`frame.T.groupby(...)` without axis instead.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            else:
+                warnings.warn(
+                    "The 'axis' keyword in DataFrame.groupby is deprecated and "
+                    "will be removed in a future version.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        else:
+            axis = 0
+
         from pandas.core.groupby.generic import DataFrameGroupBy
 
         if level is None and by is None:
             raise TypeError("You have to supply one of 'by' and 'level'")
-        axis = self._get_axis_number(axis)
 
         return DataFrameGroupBy(
             obj=self,

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -2238,6 +2238,10 @@ def fillna(
             the same results as :meth:`.DataFrame.fillna`. When the
             :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
             or ``axis=1`` here will produce the same results.
+
+            .. deprecated:: 2.1.0
+                Use frame.T.groupby(...) instead.
+
         inplace : bool, default False
             Broken. Do not set to True.
         limit : int, default None
@@ -2300,15 +2304,15 @@ def fillna(
 
         Propagate non-null values forward or backward within each group along rows.
 
-        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
+        >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="ffill").T
            key    A    B    C
         0  0.0  0.0  2.0  2.0
         1  0.0  2.0  3.0  3.0
         2  1.0  1.0  NaN  2.0
         3  1.0  3.0  NaN  NaN
         4  1.0  1.0  NaN  NaN
 
-        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
+        >>> df.T.groupby(np.array([0, 0, 1, 1])).fillna(method="bfill").T
            key    A    B    C
         0  0.0  NaN  2.0  NaN
         1  0.0  2.0  3.0  NaN

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -3071,9 +3071,10 @@ def _nth(
                 sort=self.sort,
             )
 
-        grb = dropped.groupby(
-            grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
-        )
+        if self.axis == 1:
+            grb = dropped.T.groupby(grouper, as_index=self.as_index, sort=self.sort)
+        else:
+            grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
         return grb.nth(n)
 
     @final
@@ -3882,10 +3883,13 @@ def pct_change(
             fill_method = "ffill"
             limit = 0
         filled = getattr(self, fill_method)(limit=limit)
-        fill_grp = filled.groupby(
-            self.grouper.codes, axis=self.axis, group_keys=self.group_keys
-        )
-        shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
+        if self.axis == 0:
+            fill_grp = filled.groupby(self.grouper.codes, group_keys=self.group_keys)
+        else:
+            fill_grp = filled.T.groupby(self.grouper.codes, group_keys=self.group_keys)
+        shifted = fill_grp.shift(periods=periods, freq=freq)
+        if self.axis == 1:
+            shifted = shifted.T
         return (filled / shifted) - 1
 
     @final

diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -16,6 +16,7 @@
 
 from pandas._config import using_copy_on_write
 
+from pandas._libs import lib
 from pandas._typing import (
     ArrayLike,
     Axis,
@@ -258,10 +259,25 @@ def __init__(
         key=None,
         level=None,
         freq=None,
-        axis: Axis = 0,
+        axis: Axis | lib.NoDefault = lib.no_default,
         sort: bool = False,
         dropna: bool = True,
     ) -> None:
+        if type(self) is Grouper:
+            # i.e. not TimeGrouper
+            if axis is not lib.no_default:
+                warnings.warn(
+                    "Grouper axis keyword is deprecated and will be removed in a "
+                    "future version. To group on axis=1, use obj.T.groupby(...) "
+                    "instead",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+            else:
+                axis = 0
+        if axis is lib.no_default:
+            axis = 0
+
         self.key = key
         self.level = level
         self.freq = freq

diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -1294,7 +1294,11 @@ def _downsample(self, how, **kwargs):
 
         # we are downsampling
         # we want to call the actual grouper method here
-        result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs)
+        if self.axis == 0:
+            result = obj.groupby(self.grouper).aggregate(how, **kwargs)
+        else:
+            # axis=1: group the transpose, then transpose back (test_resample_axis1)
+            result = obj.T.groupby(self.grouper).aggregate(how, **kwargs).T
 
         return self._wrap_result(result)
 

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -377,7 +377,8 @@ def _all_key(key):
             margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
             cat_axis = 1
 
-            for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
+            for key, piece in table.T.groupby(level=0, observed=observed):
+                piece = piece.T
                 all_key = _all_key(key)
 
                 # we are going to mutate this, so need to copy!
@@ -390,7 +391,7 @@ def _all_key(key):
             from pandas import DataFrame
 
             cat_axis = 0
-            for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
+            for key, piece in table.groupby(level=0, observed=observed):
                 if len(cols) > 1:
                     all_key = _all_key(key)
                 else:

diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
@@ -268,9 +268,14 @@ def test_transform_groupby_kernel_frame(request, axis, float_frame, op):
     args = [0.0] if op == "fillna" else []
     if axis in (0, "index"):
         ones = np.ones(float_frame.shape[0])
+        msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
     else:
         ones = np.ones(float_frame.shape[1])
-    expected = float_frame.groupby(ones, axis=axis).transform(op, *args)
+        msg = "DataFrame.groupby with axis=1 is deprecated"
+
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = float_frame.groupby(ones, axis=axis)
+    expected = gb.transform(op, *args)
     result = float_frame.transform(op, axis, *args)
     tm.assert_frame_equal(result, expected)
 
@@ -283,7 +288,9 @@ def test_transform_groupby_kernel_frame(request, axis, float_frame, op):
         ones = np.ones(float_frame.shape[0])
     else:
         ones = np.ones(float_frame.shape[1])
-    expected2 = float_frame.groupby(ones, axis=axis).transform(op, *args)
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb2 = float_frame.groupby(ones, axis=axis)
+    expected2 = gb2.transform(op, *args)
     result2 = float_frame.transform(op, axis, *args)
     tm.assert_frame_equal(result2, expected2)
 

diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -124,7 +124,9 @@ def test_groupby_aggregation_multi_level_column():
         columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
     )
 
-    gb = df.groupby(level=1, axis=1)
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=1, axis=1)
     result = gb.sum(numeric_only=False)
     expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
 
@@ -253,7 +255,11 @@ def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype
         [[1, 2, 3, 4, 5, 6]] * 3,
         columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
     ).astype({("a", "j"): dtype, ("b", "j"): dtype})
-    result = df.groupby(level=1, axis=1).agg(func)
+
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby(level=1, axis=1)
+    result = gb.agg(func)
     expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
         result_dtype_dict
     )
@@ -278,7 +284,11 @@ def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
         columns=Index([10, 20, 10, 20], name="x"),
         dtype="int64",
     ).astype({10: "Int64"})
-    result = df.groupby("x", axis=1).agg(func)
+
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby("x", axis=1)
+    result = gb.agg(func)
     expected = DataFrame(
         data=expected_data,
         index=Index([0, 1, 0], name="y"),
@@ -1447,7 +1457,9 @@ def test_groupby_complex_raises(func):
 def test_multi_axis_1_raises(func):
     # GH#46995
     df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
-    gb = df.groupby("a", axis=1)
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        gb = df.groupby("a", axis=1)
     with pytest.raises(NotImplementedError, match="axis other than 0 is not supported"):
         gb.agg(func)
 

diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
@@ -122,10 +122,15 @@ def test_cython_agg_frame_columns():
     # #2113
     df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
 
-    df.groupby(level=0, axis="columns").mean()
-    df.groupby(level=0, axis="columns").mean()
-    df.groupby(level=0, axis="columns").mean()
-    df.groupby(level=0, axis="columns").mean()
+    msg = "DataFrame.groupby with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.groupby(level=0, axis="columns").mean()
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.groupby(level=0, axis="columns").mean()
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.groupby(level=0, axis="columns").mean()
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        df.groupby(level=0, axis="columns").mean()
 
 
 def test_cython_agg_return_dict():

diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py
@@ -76,11 +76,15 @@ def test_regression_allowlist_methods(raw_frame, op, axis, skipna, sort):
     # explicitly test the allowlist methods
     if axis == 0:
         frame = raw_frame
+        msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be"
     else:
         frame = raw_frame.T
+        msg = "DataFrame.groupby with axis=1 is deprecated"
 
-    if op in AGG_FUNCTIONS_WITH_SKIPNA:
+    with tm.assert_produces_warning(FutureWarning, match=msg):
         grouped = frame.groupby(level=0, axis=axis, sort=sort)
+
+    if op in AGG_FUNCTIONS_WITH_SKIPNA:
         result = getattr(grouped, op)(skipna=skipna)
         expected = frame.groupby(level=0).apply(
             lambda h: getattr(h, op)(axis=axis, skipna=skipna)
@@ -89,7 +93,6 @@ def test_regression_allowlist_methods(raw_frame, op, axis, skipna, sort):
             expected = expected.sort_index(axis=axis)
         tm.assert_frame_equal(result, expected)
     else:
-        grouped = frame.groupby(level=0, axis=axis, sort=sort)
         result = getattr(grouped, op)()
         expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis))
         if sort: