Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: Decouple Series.apply from Series.agg #53400

Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Expand Up @@ -99,6 +99,7 @@ Other enhancements
- :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`)
- :meth:`DataFrame.stack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``by_row`` to :meth:`Series.apply`. When set to ``False`` the supplied callables will always operate on the whole Series (:issue:`53400`).
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

by_row now; not array_ops_only.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, changed.

- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
-
Expand Down
59 changes: 44 additions & 15 deletions pandas/core/apply.py
Expand Up @@ -16,6 +16,7 @@
Iterable,
Iterator,
List,
Literal,
Sequence,
cast,
)
Expand Down Expand Up @@ -288,6 +289,11 @@ def agg_list_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
return self.agg_or_apply_list_like(op_name="agg")

def agg_or_apply_list_like(
self, op_name: Literal["agg", "apply"]
) -> DataFrame | Series:
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
Expand All @@ -296,6 +302,9 @@ def agg_list_like(self) -> DataFrame | Series:

obj = self.obj
func = cast(List[AggFuncTypeBase], self.func)
kwargs = self.kwargs
if op_name == "apply":
kwargs = {**kwargs, "by_row": False}
Comment on lines +306 to +307
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@topper-123: shouldn't by_row here be True for backwards compatibility?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, I'm thinking this should now be self.by_row when that attribute exists. If a user calls ser.apply(["sum", "mean"], by_row=True) (or with by_row=False), shouldn't we be passing the argument down to the next call to apply?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you are right. I'll make a new PR on that.


if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand All @@ -313,8 +322,6 @@ def agg_list_like(self) -> DataFrame | Series:
keys = []

is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
is_ser_or_df = isinstance(obj, (ABCDataFrame, ABCSeries))
this_args = [self.axis, *self.args] if is_ser_or_df else self.args

context_manager: ContextManager
if is_groupby:
Expand All @@ -323,12 +330,19 @@ def agg_list_like(self) -> DataFrame | Series:
context_manager = com.temp_setattr(obj, "as_index", True)
else:
context_manager = nullcontext()

def include_axis(colg) -> bool:
return isinstance(colg, ABCDataFrame) or (
isinstance(colg, ABCSeries) and op_name == "agg"
)

with context_manager:
# degenerate case
if selected_obj.ndim == 1:
for a in func:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
new_res = colg.aggregate(a, *this_args, **self.kwargs)
args = [self.axis, *self.args] if include_axis(colg) else self.args
new_res = getattr(colg, op_name)(a, *args, **kwargs)
results.append(new_res)

# make sure we find a good name
Expand All @@ -339,7 +353,8 @@ def agg_list_like(self) -> DataFrame | Series:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
new_res = colg.aggregate(func, *this_args, **self.kwargs)
args = [self.axis, *self.args] if include_axis(colg) else self.args
new_res = getattr(colg, op_name)(func, *args, **kwargs)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)
Expand All @@ -366,15 +381,21 @@ def agg_dict_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
return self._apply_dict_like(op_name="agg")

def _apply_dict_like(self, op_name: Literal["agg", "apply"]) -> DataFrame | Series:
from pandas import Index
from pandas.core.groupby.generic import (
DataFrameGroupBy,
SeriesGroupBy,
)
from pandas.core.reshape.concat import concat

assert op_name in ["agg", "apply"]

obj = self.obj
func = cast(AggFuncTypeDict, self.func)
kwds = {"by_row": False} if op_name == "apply" else {}

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")
Expand All @@ -387,7 +408,7 @@ def agg_dict_like(self) -> DataFrame | Series:
selected_obj = obj._selected_obj
selection = obj._selection

func = self.normalize_dictlike_arg("agg", selected_obj, func)
func = self.normalize_dictlike_arg(op_name, selected_obj, func)

is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
context_manager: ContextManager
Expand All @@ -407,7 +428,9 @@ def agg_dict_like(self) -> DataFrame | Series:
if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(selection, ndim=1)
result_data = [colg.agg(how) for _, how in func.items()]
result_data = [
getattr(colg, op_name)(how, **kwds) for _, how in func.items()
]
result_index = list(func.keys())
elif is_non_unique_col:
# key used for column selection and output
Expand All @@ -422,7 +445,7 @@ def agg_dict_like(self) -> DataFrame | Series:
label_to_indices[label].append(index)

key_data = [
selected_obj._ixs(indice, axis=1).agg(how)
getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **kwds)
for label, indices in label_to_indices.items()
for indice in indices
]
Expand All @@ -432,7 +455,8 @@ def agg_dict_like(self) -> DataFrame | Series:
else:
# key used for column selection and output
result_data = [
obj._gotitem(key, ndim=1).agg(how) for key, how in func.items()
getattr(obj._gotitem(key, ndim=1), op_name)(how)
for key, how in func.items()
]
result_index = list(func.keys())

Expand Down Expand Up @@ -527,7 +551,7 @@ def apply_str(self) -> DataFrame | Series:
self.kwargs["axis"] = self.axis
return self._apply_str(obj, func, *self.args, **self.kwargs)

def apply_multiple(self) -> DataFrame | Series:
def apply_list_or_dict_like(self) -> DataFrame | Series:
"""
Compute apply in case of a list-like or dict-like.

Expand All @@ -543,9 +567,9 @@ def apply_multiple(self) -> DataFrame | Series:
kwargs = self.kwargs

if is_dict_like(func):
result = self.agg_dict_like()
result = self._apply_dict_like(op_name="apply")
else:
result = self.agg_list_like()
result = self.agg_or_apply_list_like(op_name="apply")

result = reconstruct_and_relabel_result(result, func, **kwargs)

Expand Down Expand Up @@ -685,8 +709,8 @@ def values(self):
def apply(self) -> DataFrame | Series:
"""compute the results"""
# dispatch to handle list-like or dict-like (dicts are list-like too)
if is_list_like(self.func):
return self.apply_multiple()
if is_list_like(self.func) or is_dict_like(self.func):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dicts are considered list-like; no need for the 2nd check here.

Copy link
Contributor Author

@topper-123 topper-123 Jun 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I changed it. I've changed the comment above instead to explain dictlike go here too.

return self.apply_list_or_dict_like()

# all empty
if len(self.columns) == 0 and len(self.index) == 0:
Expand Down Expand Up @@ -1033,13 +1057,15 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
class SeriesApply(NDFrameApply):
obj: Series
axis: AxisInt = 0
by_row: bool # only relevant for apply()

def __init__(
self,
obj: Series,
func: AggFuncType,
*,
convert_dtype: bool | lib.NoDefault = lib.no_default,
by_row: bool = True,
args,
kwargs,
) -> None:
Expand All @@ -1054,6 +1080,7 @@ def __init__(
stacklevel=find_stack_level(),
)
self.convert_dtype = convert_dtype
self.by_row = by_row

super().__init__(
obj,
Expand All @@ -1071,8 +1098,8 @@ def apply(self) -> DataFrame | Series:
return self.apply_empty_result()

# dispatch to handle list-like or dict-like (dicts are list-like too)
if is_list_like(self.func):
return self.apply_multiple()
if is_list_like(self.func) or is_dict_like(self.func):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, changed.

return self.apply_list_or_dict_like()

if isinstance(self.func, str):
# if we are a string, try to dispatch
Expand Down Expand Up @@ -1118,6 +1145,8 @@ def apply_standard(self) -> DataFrame | Series:
if isinstance(func, np.ufunc):
with np.errstate(all="ignore"):
return func(obj, *self.args, **self.kwargs)
elif not self.by_row:
return func(obj, *self.args, **self.kwargs)

if self.args or self.kwargs:
# _map_values does not support args/kwargs
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/series.py
Expand Up @@ -4492,6 +4492,8 @@ def apply(
func: AggFuncType,
convert_dtype: bool | lib.NoDefault = lib.no_default,
args: tuple[Any, ...] = (),
*,
by_row: bool = True,
**kwargs,
) -> DataFrame | Series:
"""
Expand Down Expand Up @@ -4519,6 +4521,12 @@ def apply(
instead if you want ``convert_dtype=False``.
args : tuple
Positional arguments passed to func after the series value.
by_row : bool, default True
If False, the func will be passed the whole Series at once.
If True, the func will be passed each element of the Series, like
Series.map (backward compatible).

.. versionadded:: 2.1.0
**kwargs
Additional keyword arguments passed to func.

Expand Down Expand Up @@ -4607,7 +4615,12 @@ def apply(
dtype: float64
"""
return SeriesApply(
self, func, convert_dtype=convert_dtype, args=args, kwargs=kwargs
self,
func,
convert_dtype=convert_dtype,
by_row=by_row,
args=args,
kwargs=kwargs,
).apply()

def _reindex_indexer(
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/apply/test_series_apply.py
Expand Up @@ -535,6 +535,20 @@ def test_apply_listlike_transformer(string_series, ops, names):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"ops, expected",
[
([lambda x: x], DataFrame({"<lambda>": [1, 2, 3]})),
([lambda x: x.sum()], Series([6], index=["<lambda>"])),
],
)
def test_apply_listlike_lambda(ops, expected):
# GH53400
ser = Series([1, 2, 3])
result = ser.apply(ops)
tm.assert_equal(result, expected)


@pytest.mark.parametrize(
"ops",
[
Expand Down