Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate level keyword for dataframe and series aggregations #40869

Merged
merged 6 commits into from
Apr 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/user_guide/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ values across a level. For instance:
)
df = pd.DataFrame(np.random.randn(4, 2), index=midx)
df
df2 = df.mean(level=0)
df2 = df.groupby(level=0).mean()
df2
df2.reindex(df.index, level=0)

Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ even if some categories are not present in the data:
data=[[1, 2, 3], [4, 5, 6]],
columns=pd.MultiIndex.from_arrays([["A", "B", "B"], columns]),
)
df.sum(axis=1, level=1)
df.groupby(axis=1, level=1).sum()

Groupby will also show "unused" categories:

Expand Down
8 changes: 0 additions & 8 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -320,14 +320,6 @@ number:

s.groupby(level="second").sum()

The aggregation functions such as ``sum`` will take the level parameter
directly. Additionally, the resulting index will be named according to the
chosen level:

.. ipython:: python

s.sum(level="second")

Grouping with multiple levels is supported.

.. ipython:: python
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.15.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ Other enhancements:
- ``Series.all`` and ``Series.any`` now support the ``level`` and ``skipna`` parameters (:issue:`8302`):

.. ipython:: python
:okwarning:

s = pd.Series([False, True, False], index=[0, 0, 1])
s.any(level=0)
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ Deprecations
- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`)
- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`)
- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`)
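- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use :meth:`DataFrame.groupby` or :meth:`Series.groupby` instead, e.g. ``df.groupby(level=1).sum()`` in place of ``df.sum(level=1)`` (:issue:`39983`)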

.. ---------------------------------------------------------------------------

Expand Down
7 changes: 7 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9479,6 +9479,13 @@ def count(
"""
axis = self._get_axis_number(axis)
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
"deprecated and will be removed in a future version. Use groupby "
jreback marked this conversation as resolved.
Show resolved Hide resolved
"instead. df.count(level=1) should use df.groupby(level=1).count().",
FutureWarning,
stacklevel=2,
)
return self._count_level(level, axis=axis, numeric_only=numeric_only)

if numeric_only:
Expand Down
35 changes: 35 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10368,6 +10368,13 @@ def _logical_func(
):
nv.validate_logical_func((), kwargs, fname=name)
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
jreback marked this conversation as resolved.
Show resolved Hide resolved
"deprecated and will be removed in a future version. Use groupby "
"instead. df.any(level=1) should use df.groupby(level=1).any()",
FutureWarning,
stacklevel=4,
)
if bool_only is not None:
raise NotImplementedError(
"Option bool_only is not implemented with option level."
Expand Down Expand Up @@ -10459,6 +10466,13 @@ def _stat_function_ddof(
if axis is None:
axis = self._stat_axis_number
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
"deprecated and will be removed in a future version. Use groupby "
"instead. df.var(level=1) should use df.groupby(level=1).var().",
FutureWarning,
stacklevel=4,
)
return self._agg_by_level(
name, axis=axis, level=level, skipna=skipna, ddof=ddof
)
Expand Down Expand Up @@ -10507,6 +10521,13 @@ def _stat_function(
if axis is None:
axis = self._stat_axis_number
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
"deprecated and will be removed in a future version. Use groupby "
"instead. df.median(level=1) should use df.groupby(level=1).median().",
FutureWarning,
stacklevel=4,
)
return self._agg_by_level(
name, axis=axis, level=level, skipna=skipna, numeric_only=numeric_only
)
Expand Down Expand Up @@ -10569,6 +10590,13 @@ def _min_count_stat_function(
if axis is None:
axis = self._stat_axis_number
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
"deprecated and will be removed in a future version. Use groupby "
"instead. df.sum(level=1) should use df.groupby(level=1).sum().",
FutureWarning,
stacklevel=4,
)
return self._agg_by_level(
name,
axis=axis,
Expand Down Expand Up @@ -10646,6 +10674,13 @@ def mad(self, axis=None, skipna=None, level=None):
if axis is None:
axis = self._stat_axis_number
if level is not None:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
"deprecated and will be removed in a future version. Use groupby "
"instead. df.mad(level=1) should use df.groupby(level=1).mad()",
FutureWarning,
stacklevel=3,
)
return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna)

data = self._get_numeric_data()
Expand Down
12 changes: 10 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,8 +1894,16 @@ def count(self, level=None):
"""
if level is None:
return notna(self._values).sum()
elif not isinstance(self.index, MultiIndex):
raise ValueError("Series.count level is only valid with a MultiIndex")
else:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
"deprecated and will be removed in a future version. Use groupby "
"instead. ser.count(level=1) should use ser.groupby(level=1).count().",
FutureWarning,
jreback marked this conversation as resolved.
Show resolved Hide resolved
stacklevel=2,
)
if not isinstance(self.index, MultiIndex):
raise ValueError("Series.count level is only valid with a MultiIndex")

index = self.index
assert isinstance(index, MultiIndex) # for mypy
Expand Down
104 changes: 0 additions & 104 deletions pandas/tests/frame/methods/test_count.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,11 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm


class TestDataFrameCount:
def test_count_multiindex(self, multiindex_dataframe_random_data):
    """Counting by level name and by level position yields the same frame.

    Also checks that an unknown level name raises ``KeyError``.
    """
    df = multiindex_dataframe_random_data.copy()
    df.index.names = ["a", "b"]

    # Each pair is (level name, position of that level in the index).
    for label, position in (("b", 1), ("a", 0)):
        by_label = df.count(level=label)
        by_position = df.count(level=position)
        tm.assert_frame_equal(by_label, by_position, check_names=False)

    with pytest.raises(KeyError, match="Level x not found"):
        df.count(level="x")

def test_count(self):
# corner case
frame = DataFrame()
Expand Down Expand Up @@ -59,85 +37,3 @@ def test_count_objects(self, float_string_frame):

tm.assert_series_equal(dm.count(), df.count())
tm.assert_series_equal(dm.count(1), df.count(1))

def test_count_level_corner(self, multiindex_dataframe_random_data):
    """``count(level=0)`` on zero-length slices returns zeros per level value."""
    frame = multiindex_dataframe_random_data

    # Series case: zero-length slice of column "A".
    empty_ser = frame["A"][:0]
    ser_counts = empty_ser.count(level=0)
    tm.assert_series_equal(
        ser_counts, Series(0, index=empty_ser.index.levels[0], name="A")
    )

    # DataFrame case: zero-length slice of the whole frame.
    empty_df = frame[:0]
    df_counts = empty_df.count(level=0)
    first_level = empty_ser.index.levels[0].set_names(["first"])
    skeleton = DataFrame(index=first_level, columns=empty_df.columns)
    tm.assert_frame_equal(df_counts, skeleton.fillna(0).astype(np.int64))

def test_count_index_with_nan(self):
    """Per-level counts when one index label is missing.

    The row whose "Person" label is ``None`` does not appear in the
    expected result, on either axis.
    """
    # https://github.com/pandas-dev/pandas/issues/21824
    records = {
        "Person": ["John", "Myla", None, "John", "Myla"],
        "Age": [24.0, 5, 21.0, 33, 26],
        "Single": [False, True, True, True, False],
    }
    indexed = DataFrame(records).set_index(["Person", "Single"])
    people = ["John", "Myla"]

    # count on row labels
    by_rows = indexed.count(level="Person")
    tm.assert_frame_equal(
        by_rows,
        DataFrame(
            index=Index(people, name="Person"),
            columns=Index(["Age"]),
            data=[2, 2],
        ),
    )

    # count on column labels
    by_cols = indexed.T.count(level="Person", axis=1)
    tm.assert_frame_equal(
        by_cols,
        DataFrame(
            columns=Index(people, name="Person"),
            index=Index(["Age"]),
            data=[[2, 2]],
        ),
    )

def test_count_level(
    self,
    multiindex_year_month_day_dataframe_random_data,
    multiindex_dataframe_random_data,
):
    """``count(level=i)`` agrees with ``groupby(level=i).count()`` on both axes."""
    ymd = multiindex_year_month_day_dataframe_random_data
    frame = multiindex_dataframe_random_data

    def _check_counts(obj, axis=0):
        # Compare against the groupby result for every level of the axis.
        nlevels = obj._get_axis(axis).nlevels
        for lvl in range(nlevels):
            result = obj.count(axis=axis, level=lvl)
            via_groupby = obj.groupby(axis=axis, level=lvl).count()
            expected = via_groupby.reindex_like(result).astype("i8")
            tm.assert_frame_equal(result, expected)

    # Put some missing values into both fixtures before counting.
    for data in (frame, ymd):
        data.iloc[1, [1, 2]] = np.nan
        data.iloc[7, [0, 1]] = np.nan

    for data, axis in ((frame, 0), (ymd, 0), (frame.T, 1), (ymd.T, 1)):
        _check_counts(data, axis=axis)

    # can't call with level on regular DataFrame
    flat = tm.makeTimeDataFrame()
    with pytest.raises(TypeError, match="hierarchical"):
        flat.count(level=0)

    # With numeric_only=True the string column "D" is not in the result.
    frame["D"] = "foo"
    numeric_counts = frame.count(level=0, numeric_only=True)
    tm.assert_index_equal(numeric_counts.columns, Index(list("ABC"), name="exp"))
123 changes: 123 additions & 0 deletions pandas/tests/frame/methods/test_count_with_level_deprecated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm


class TestDataFrameCount:
    """Tests for ``count`` called with the deprecated ``level`` keyword.

    Every call that passes ``level`` is wrapped in
    ``tm.assert_produces_warning(FutureWarning)``.
    """

    def test_count_multiindex(self, multiindex_dataframe_random_data):
        """Counting by level name and by level position yields the same frame.

        Also checks that an unknown level name raises ``KeyError``.
        """
        df = multiindex_dataframe_random_data.copy()
        df.index.names = ["a", "b"]

        # Each pair is (level name, position of that level in the index).
        for label, position in (("b", 1), ("a", 0)):
            with tm.assert_produces_warning(FutureWarning):
                by_label = df.count(level=label)
            with tm.assert_produces_warning(FutureWarning):
                by_position = df.count(level=position)
            tm.assert_frame_equal(by_label, by_position, check_names=False)

        with pytest.raises(KeyError, match="Level x not found"):
            with tm.assert_produces_warning(FutureWarning):
                df.count(level="x")

    def test_count_level_corner(self, multiindex_dataframe_random_data):
        """``count(level=0)`` on zero-length slices returns zeros per level value."""
        frame = multiindex_dataframe_random_data

        # Series case: zero-length slice of column "A".
        empty_ser = frame["A"][:0]
        with tm.assert_produces_warning(FutureWarning):
            ser_counts = empty_ser.count(level=0)
        tm.assert_series_equal(
            ser_counts, Series(0, index=empty_ser.index.levels[0], name="A")
        )

        # DataFrame case: zero-length slice of the whole frame.
        empty_df = frame[:0]
        with tm.assert_produces_warning(FutureWarning):
            df_counts = empty_df.count(level=0)
        first_level = empty_ser.index.levels[0].set_names(["first"])
        skeleton = DataFrame(index=first_level, columns=empty_df.columns)
        tm.assert_frame_equal(df_counts, skeleton.fillna(0).astype(np.int64))

    def test_count_index_with_nan(self):
        """Per-level counts when one index label is missing.

        The row whose "Person" label is ``None`` does not appear in the
        expected result, on either axis.
        """
        # https://github.com/pandas-dev/pandas/issues/21824
        records = {
            "Person": ["John", "Myla", None, "John", "Myla"],
            "Age": [24.0, 5, 21.0, 33, 26],
            "Single": [False, True, True, True, False],
        }
        indexed = DataFrame(records).set_index(["Person", "Single"])
        people = ["John", "Myla"]

        # count on row labels
        with tm.assert_produces_warning(FutureWarning):
            by_rows = indexed.count(level="Person")
        tm.assert_frame_equal(
            by_rows,
            DataFrame(
                index=Index(people, name="Person"),
                columns=Index(["Age"]),
                data=[2, 2],
            ),
        )

        # count on column labels
        with tm.assert_produces_warning(FutureWarning):
            by_cols = indexed.T.count(level="Person", axis=1)
        tm.assert_frame_equal(
            by_cols,
            DataFrame(
                columns=Index(people, name="Person"),
                index=Index(["Age"]),
                data=[[2, 2]],
            ),
        )

    def test_count_level(
        self,
        multiindex_year_month_day_dataframe_random_data,
        multiindex_dataframe_random_data,
    ):
        """``count(level=i)`` agrees with ``groupby(level=i).count()`` on both axes."""
        ymd = multiindex_year_month_day_dataframe_random_data
        frame = multiindex_dataframe_random_data

        def _check_counts(obj, axis=0):
            # Compare against the groupby result for every level of the axis.
            nlevels = obj._get_axis(axis).nlevels
            for lvl in range(nlevels):
                with tm.assert_produces_warning(FutureWarning):
                    result = obj.count(axis=axis, level=lvl)
                via_groupby = obj.groupby(axis=axis, level=lvl).count()
                expected = via_groupby.reindex_like(result).astype("i8")
                tm.assert_frame_equal(result, expected)

        # Put some missing values into both fixtures before counting.
        for data in (frame, ymd):
            data.iloc[1, [1, 2]] = np.nan
            data.iloc[7, [0, 1]] = np.nan

        for data, axis in ((frame, 0), (ymd, 0), (frame.T, 1), (ymd.T, 1)):
            _check_counts(data, axis=axis)

        # can't call with level on regular DataFrame
        flat = tm.makeTimeDataFrame()
        with pytest.raises(TypeError, match="hierarchical"):
            with tm.assert_produces_warning(FutureWarning):
                flat.count(level=0)

        # With numeric_only=True the string column "D" is not in the result.
        frame["D"] = "foo"
        with tm.assert_produces_warning(FutureWarning):
            numeric_counts = frame.count(level=0, numeric_only=True)
        tm.assert_index_equal(
            numeric_counts.columns, Index(list("ABC"), name="exp")
        )