Skip to content
15 changes: 15 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,32 @@ did not have the same index as the input.

.. code-block:: ipython

In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
Out[3]:
b
0 5
1 5

In [4]: df.groupby('a', dropna=True).transform(lambda x: x)
Out[4]:
b
0 2
1 3

In [5]: df.groupby('a', dropna=True).transform('sum')
Out[5]:
b
0 5
1 5
2 5

*New behavior*:

.. ipython:: python

df.groupby('a', dropna=True).transform(lambda x: x.sum())
df.groupby('a', dropna=True).transform(lambda x: x)
df.groupby('a', dropna=True).transform('sum')

.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:

Expand Down
20 changes: 18 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3693,10 +3693,26 @@ class max_speed

nv.validate_take((), kwargs)

return self._take(indices, axis)

def _take(
    self: NDFrameT,
    indices,
    axis=0,
    convert_indices: bool_t = True,
) -> NDFrameT:
    """
    Internal version of the `take` allowing specification of additional args.

    See the docstring of `take` for full explanation of the parameters.

    Parameters
    ----------
    indices : array-like
        Positions to take along ``axis``.
    axis : int, default 0
        Axis along which to take.
    convert_indices : bool, default True
        Whether to attempt to convert negative indices to positive ones.
        Pass False so that negative indices are kept as-is and can give
        rise to null values in the result (GH#46209).
    """
    self._consolidate_inplace()

    new_data = self._mgr.take(
        indices,
        axis=self._get_block_manager_axis(axis),
        verify=True,
        convert_indices=convert_indices,
    )
    return self._constructor(new_data).__finalize__(self, method="take")

Expand All @@ -3708,7 +3724,7 @@ def _take_with_is_copy(self: NDFrameT, indices, axis=0) -> NDFrameT:

See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis)
result = self._take(indices=indices, axis=axis)
# Maybe set copy if we didn't actually change the index.
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
Expand Down
12 changes: 10 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1646,7 +1646,10 @@ def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
out = algorithms.take_nd(result._values, ids)
output = obj._constructor(out, index=obj.index, name=obj.name)
else:
output = result.take(ids, axis=self.axis)
# GH#46209
# Don't convert indices: negative indices need to give rise
# to null values in the result
output = result._take(ids, axis=self.axis, convert_indices=False)
output = output.set_axis(obj._get_axis(self.axis), axis=self.axis)
return output

Expand Down Expand Up @@ -1699,9 +1702,14 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
else:
out = np.repeat(out[np.r_[run[1:], True]], rep) - out

if self.grouper.has_dropped_na:
out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
else:
out = out.astype(np.int64, copy=False)

rev = np.empty(count, dtype=np.intp)
rev[sorter] = np.arange(count, dtype=np.intp)
return out[rev].astype(np.int64, copy=False)
return out[rev]

# -----------------------------------------------------------------

Expand Down
11 changes: 9 additions & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,13 @@ def _reindex_indexer(

return type(self)(new_arrays, new_axes, verify_integrity=False)

def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
def take(
self: T,
indexer,
axis: int = 1,
verify: bool = True,
convert_indices: bool = True,
) -> T:
"""
Take items along any axis.
"""
Expand All @@ -656,7 +662,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
raise ValueError("indexer should be 1-dimensional")

n = self.shape_proper[axis]
indexer = maybe_convert_indices(indexer, n, verify=verify)
if convert_indices:
indexer = maybe_convert_indices(indexer, n, verify=verify)

new_labels = self._axes[axis].take(indexer)
return self._reindex_indexer(
Expand Down
13 changes: 11 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,7 +829,13 @@ def _make_na_block(
block_values.fill(fill_value)
return new_block_2d(block_values, placement=placement)

def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
def take(
self: T,
indexer,
axis: int = 1,
verify: bool = True,
convert_indices: bool = True,
) -> T:
"""
Take items along any axis.

Expand All @@ -838,6 +844,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
verify : bool, default True
Check that all entries are between 0 and len(self) - 1, inclusive.
Pass verify=False if this check has been done by the caller.
convert_indices : bool, default True
Whether to attempt to convert indices to positive values.

Returns
-------
Expand All @@ -851,7 +859,8 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
)

n = self.shape[axis]
indexer = maybe_convert_indices(indexer, n, verify=verify)
if convert_indices:
indexer = maybe_convert_indices(indexer, n, verify=verify)

new_labels = self.axes[axis].take(indexer)
return self.reindex_indexer(
Expand Down
193 changes: 193 additions & 0 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,3 +1308,196 @@ def test_null_group_lambda_self(sort, dropna):
gb = df.groupby("A", dropna=dropna, sort=sort)
result = gb.transform(lambda x: x)
tm.assert_frame_equal(result, expected)


def test_null_group_str_reducer(request, dropna, reduction_func):
    # GH 17093
    # Transforming a DataFrameGroupBy with a string reducer should broadcast
    # each group's reduction back over the group's rows; with dropna=True the
    # rows of the null group must come back as NaN instead of being dropped.
    if reduction_func in ("corrwith", "ngroup"):
        msg = "incorrectly raises"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    index = [1, 2, 3, 4]  # test transform preserves non-standard index
    df = DataFrame({"A": [1, 1, np.nan, np.nan], "B": [1, 2, 2, 3]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    # Positional arguments that some reducers require.
    if reduction_func == "corrwith":
        args = (df["B"],)
    elif reduction_func == "nth":
        args = (0,)
    else:
        args = ()

    # Manually handle reducers that don't fit the generic pattern
    # Set expected with dropna=False, then replace if necessary
    if reduction_func == "first":
        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
    elif reduction_func == "last":
        expected = DataFrame({"B": [2, 2, 3, 3]}, index=index)
    elif reduction_func == "nth":
        expected = DataFrame({"B": [1, 1, 2, 2]}, index=index)
    elif reduction_func == "size":
        expected = Series([2, 2, 2, 2], index=index)
    elif reduction_func == "corrwith":
        expected = DataFrame({"B": [1.0, 1.0, 1.0, 1.0]}, index=index)
    else:
        # Generic path: reduce each dropna=False group and broadcast the
        # scalar result over that group's index.
        expected_gb = df.groupby("A", dropna=False)
        buffer = []
        for idx, group in expected_gb:
            res = getattr(group["B"], reduction_func)()
            buffer.append(Series(res, index=group.index))
        expected = concat(buffer).to_frame("B")
    if dropna:
        # Null-group rows become NaN; the dtype must be widened to hold NaN
        # (object for boolean reducers, float otherwise).
        dtype = object if reduction_func in ("any", "all") else float
        expected = expected.astype(dtype)
        if expected.ndim == 2:
            expected.iloc[[2, 3], 0] = np.nan
        else:
            expected.iloc[[2, 3]] = np.nan

    result = gb.transform(reduction_func, *args)
    tm.assert_equal(result, expected)


def test_null_group_str_transformer(
    request, using_array_manager, dropna, transformation_func
):
    # GH 17093
    # String transformers on a DataFrameGroupBy must act group-wise and, with
    # dropna=True, emit NaN for rows whose group key is null.
    # Known-bad transformers differ between the block and array managers.
    xfails_block = (
        "cummax",
        "cummin",
        "cumsum",
        "fillna",
        "rank",
        "backfill",
        "ffill",
        "bfill",
        "pad",
    )
    xfails_array = ("cummax", "cummin", "cumsum", "fillna", "rank")
    if transformation_func == "tshift":
        msg = "tshift requires timeseries"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    elif dropna and (
        (not using_array_manager and transformation_func in xfails_block)
        or (using_array_manager and transformation_func in xfails_array)
    ):
        msg = "produces incorrect results when nans are present"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    # fillna needs a fill value.
    args = (0,) if transformation_func == "fillna" else ()
    df = DataFrame({"A": [1, 1, np.nan], "B": [1, 2, 2]}, index=[1, 2, 3])
    gb = df.groupby("A", dropna=dropna)

    # Build the expected result by applying the transformer to each non-null
    # group by hand.
    buffer = []
    for k, (idx, group) in enumerate(gb):
        if transformation_func == "cumcount":
            # DataFrame has no cumcount method
            res = DataFrame({"B": range(len(group))}, index=group.index)
        elif transformation_func == "ngroup":
            res = DataFrame(len(group) * [k], index=group.index, columns=["B"])
        else:
            res = getattr(group[["B"]], transformation_func)(*args)
        buffer.append(res)
    if dropna:
        # The null-group row is NaN; boolean transformers need object dtype.
        dtype = object if transformation_func in ("any", "all") else None
        buffer.append(DataFrame([[np.nan]], index=[3], dtype=dtype, columns=["B"]))
    expected = concat(buffer)

    if transformation_func in ("cumcount", "ngroup"):
        # ngroup/cumcount always returns a Series as it counts the groups, not values
        expected = expected["B"].rename(None)

    # backfill/pad are deprecated aliases and must warn.
    warn = FutureWarning if transformation_func in ("backfill", "pad") else None
    msg = f"{transformation_func} is deprecated"
    with tm.assert_produces_warning(warn, match=msg):
        result = gb.transform(transformation_func, *args)

    tm.assert_equal(result, expected)


def test_null_group_str_reducer_series(request, dropna, reduction_func):
    # GH 17093
    # SeriesGroupBy analogue of test_null_group_str_reducer: transforming with
    # a string reducer broadcasts the group reduction over each group's rows,
    # and the null group's rows become NaN when dropna=True.
    if reduction_func == "corrwith":
        msg = "corrwith not implemented for SeriesGroupBy"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    if reduction_func == "ngroup":
        msg = "ngroup fails"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    # GH 17093
    index = [1, 2, 3, 4]  # test transform preserves non-standard index
    ser = Series([1, 2, 2, 3], index=index)
    gb = ser.groupby([1, 1, np.nan, np.nan], dropna=dropna)

    # Positional arguments that some reducers require.
    if reduction_func == "corrwith":
        args = (ser,)
    elif reduction_func == "nth":
        args = (0,)
    else:
        args = ()

    # Manually handle reducers that don't fit the generic pattern
    # Set expected with dropna=False, then replace if necessary
    if reduction_func == "first":
        expected = Series([1, 1, 2, 2], index=index)
    elif reduction_func == "last":
        expected = Series([2, 2, 3, 3], index=index)
    elif reduction_func == "nth":
        expected = Series([1, 1, 2, 2], index=index)
    elif reduction_func == "size":
        expected = Series([2, 2, 2, 2], index=index)
    elif reduction_func == "corrwith":
        expected = Series([1, 1, 2, 2], index=index)
    else:
        # Generic path: reduce each dropna=False group and broadcast the
        # scalar result over that group's index.
        expected_gb = ser.groupby([1, 1, np.nan, np.nan], dropna=False)
        buffer = []
        for idx, group in expected_gb:
            res = getattr(group, reduction_func)()
            buffer.append(Series(res, index=group.index))
        expected = concat(buffer)
    if dropna:
        # Null-group rows become NaN; widen dtype so NaN fits.
        dtype = object if reduction_func in ("any", "all") else float
        expected = expected.astype(dtype)
        expected.iloc[[2, 3]] = np.nan

    result = gb.transform(reduction_func, *args)
    tm.assert_series_equal(result, expected)


def test_null_group_str_transformer_series(request, dropna, transformation_func):
    # GH 17093
    # String transformers on a SeriesGroupBy must operate group-wise; with
    # dropna=True, the row belonging to the null group comes back as NaN.
    known_bad_with_nans = ("cummax", "cummin", "cumsum", "fillna", "rank")
    if transformation_func == "tshift":
        request.node.add_marker(
            pytest.mark.xfail(reason="tshift requires timeseries")
        )
    elif dropna and transformation_func in known_bad_with_nans:
        request.node.add_marker(
            pytest.mark.xfail(
                reason="produces incorrect results when nans are present"
            )
        )
    # fillna needs a fill value; every other transformer takes no args here.
    args = (0,) if transformation_func == "fillna" else ()
    ser = Series([1, 2, 2], index=[1, 2, 3])
    gb = ser.groupby([1, 1, np.nan], dropna=dropna)

    # Build the expected result by transforming each non-null group by hand.
    pieces = []
    for group_num, (_, grp) in enumerate(gb):
        if transformation_func == "cumcount":
            # Series has no cumcount method
            piece = Series(range(len(grp)), index=grp.index)
        elif transformation_func == "ngroup":
            piece = Series(group_num, index=grp.index)
        else:
            piece = getattr(grp, transformation_func)(*args)
        pieces.append(piece)
    if dropna:
        # The null-group row is NaN; boolean transformers need object dtype.
        na_dtype = object if transformation_func in ("any", "all") else None
        pieces.append(Series([np.nan], index=[3], dtype=na_dtype))
    expected = concat(pieces)

    # backfill/pad are deprecated aliases and must warn.
    warn = FutureWarning if transformation_func in ("backfill", "pad") else None
    msg = f"{transformation_func} is deprecated"
    with tm.assert_produces_warning(warn, match=msg):
        result = gb.transform(transformation_func, *args)
    tm.assert_equal(result, expected)