-
-
Notifications
You must be signed in to change notification settings - Fork 17.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG/PERF: groupby.transform with unobserved categories #58084
Changes from 7 commits
a52e7fe
898fd12
5311004
fb548ad
baa1b28
30013ee
3b9d27b
4221c34
8588a1e
8e669d9
0d9f89d
84f83ae
cbabce0
bcca14f
f3a3f63
58e759f
7f99b71
4364440
5d63405
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -588,6 +588,8 @@ class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin): | |
"obj", | ||
"observed", | ||
"sort", | ||
"observed_grouper", | ||
"observed_exclusions", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of this, I recommend adding it as a cached method on the
For this to work, you also need to do the same to Grouping:
and use the observed_groupings in the BaseGrouper call above. For BinGrouper, I think you can just always return Also, you can ignore There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah my bad, I have made the changes as suggested |
||
} | ||
|
||
_grouper: ops.BaseGrouper | ||
|
@@ -1106,6 +1108,8 @@ def __init__( | |
group_keys: bool = True, | ||
observed: bool = False, | ||
dropna: bool = True, | ||
observed_grouper: ops.BaseGrouper | None = None, | ||
observed_exclusions: frozenset[Hashable] | None = None, | ||
) -> None: | ||
self._selection = selection | ||
|
||
|
@@ -1118,6 +1122,21 @@ def __init__( | |
self.group_keys = group_keys | ||
self.dropna = dropna | ||
|
||
if not observed and grouper is None: | ||
observed_grouper, observed_exclusions, _ = get_grouper( | ||
obj, | ||
self.keys, | ||
level=self.level, | ||
sort=self.sort, | ||
observed=True, | ||
dropna=self.dropna, | ||
) | ||
|
||
self.observed_grouper = observed_grouper | ||
self.observed_exclusions = ( | ||
frozenset(observed_exclusions) if observed_exclusions else frozenset() | ||
) | ||
|
||
if grouper is None: | ||
grouper, exclusions, obj = get_grouper( | ||
obj, | ||
|
@@ -1879,24 +1898,41 @@ def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): | |
|
||
else: | ||
# i.e. func in base.reduction_kernels | ||
if self.observed: | ||
return self._reduction_kernel_transform( | ||
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs | ||
) | ||
|
||
# GH#30918 Use _transform_fast only when we know func is an aggregation | ||
# If func is a reduction, we need to broadcast the | ||
# result to the whole group. Compute func result | ||
# and deal with possible broadcasting below. | ||
with com.temp_setattr(self, "as_index", True): | ||
# GH#49834 - result needs groups in the index for | ||
# _wrap_transform_fast_result | ||
if func in ["idxmin", "idxmax"]: | ||
func = cast(Literal["idxmin", "idxmax"], func) | ||
result = self._idxmax_idxmin(func, True, *args, **kwargs) | ||
else: | ||
if engine is not None: | ||
kwargs["engine"] = engine | ||
kwargs["engine_kwargs"] = engine_kwargs | ||
result = getattr(self, func)(*args, **kwargs) | ||
with ( | ||
com.temp_setattr(self, "observed", True), | ||
com.temp_setattr(self, "_grouper", self.observed_grouper), | ||
com.temp_setattr(self, "exclusions", self.observed_exclusions), | ||
): | ||
return self._reduction_kernel_transform( | ||
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs | ||
) | ||
|
||
@final | ||
def _reduction_kernel_transform( | ||
self, func, *args, engine=None, engine_kwargs=None, **kwargs | ||
): | ||
# GH#30918 Use _transform_fast only when we know func is an aggregation | ||
# If func is a reduction, we need to broadcast the | ||
# result to the whole group. Compute func result | ||
# and deal with possible broadcasting below. | ||
with com.temp_setattr(self, "as_index", True): | ||
# GH#49834 - result needs groups in the index for | ||
# _wrap_transform_fast_result | ||
if func in ["idxmin", "idxmax"]: | ||
func = cast(Literal["idxmin", "idxmax"], func) | ||
result = self._idxmax_idxmin(func, True, *args, **kwargs) | ||
else: | ||
if engine is not None: | ||
kwargs["engine"] = engine | ||
kwargs["engine_kwargs"] = engine_kwargs | ||
result = getattr(self, func)(*args, **kwargs) | ||
|
||
return self._wrap_transform_fast_result(result) | ||
return self._wrap_transform_fast_result(result) | ||
|
||
@final | ||
def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1232,9 +1232,9 @@ def test_categorical_and_not_categorical_key(observed): | |
tm.assert_frame_equal(result, expected_explicit) | ||
|
||
# Series case | ||
result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( | ||
"sum" | ||
) | ||
gb = df_with_categorical.groupby(["A", "C"], observed=observed) | ||
gbp = gb["B"] | ||
result = gbp.transform("sum") | ||
expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") | ||
tm.assert_series_equal(result, expected) | ||
expected_explicit = Series([4, 2, 4], name="B") | ||
|
@@ -1535,3 +1535,151 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels(): | |
result = df.groupby(series, as_index=False).transform("sum") | ||
expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]}) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
||
# GH#58084 | ||
def test_min_one_unobserved_category_no_type_coercion(): | ||
df = DataFrame({"A": Categorical([1, 1, 2], categories=[1, 2, 3]), "B": [3, 4, 5]}) | ||
undermyumbrella1 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
df["B"] = df["B"].astype("int32") | ||
gb = df.groupby("A", observed=False) | ||
result = gb.transform("min") | ||
|
||
expected = DataFrame({"B": [3, 3, 5]}, dtype="int32") | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
# GH#58084 | ||
def test_min_multiple_unobserved_categories_no_type_coercion(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This seems redundant to me - I think the above test is sufficient here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. resolved |
||
df = DataFrame( | ||
{ | ||
"X": Categorical( | ||
["432945", "randomcat", -4325466, "randomcat", -4325466, -4325466], | ||
categories=[ | ||
1, | ||
"randomcat", | ||
100, | ||
333, | ||
"cat43543", | ||
-4325466, | ||
54665, | ||
-546767, | ||
"432945", | ||
767076, | ||
], | ||
), | ||
"Y": [0, 940645, np.iinfo(np.int64).min, 9449, 100044444, 40], | ||
} | ||
) | ||
df["Y"] = df["Y"].astype("int64") | ||
|
||
gb = df.groupby("X", observed=False) | ||
result = gb.transform("min") | ||
|
||
expected = DataFrame( | ||
{ | ||
"Y": [ | ||
0, | ||
9449, | ||
np.iinfo(np.int64).min, | ||
9449, | ||
np.iinfo(np.int64).min, | ||
np.iinfo(np.int64).min, | ||
] | ||
}, | ||
dtype="int64", | ||
) | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
# GH#58084 | ||
def test_min_float32_multiple_unobserved_categories_no_type_coercion(): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you instead parametrize
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. resolved |
||
df = DataFrame( | ||
{ | ||
"X": Categorical( | ||
["cat43543", -4325466, 54665, "cat43543", -4325466, 54665], | ||
categories=[ | ||
1, | ||
"randomcat", | ||
100, | ||
333, | ||
"cat43543", | ||
-4325466, | ||
54665, | ||
-546767, | ||
"432945", | ||
767076, | ||
], | ||
), | ||
"Y": [ | ||
0.3940429, | ||
940645.49, | ||
np.finfo(np.float32).min, | ||
9449.03333, | ||
100044444.403294, | ||
40.3020909, | ||
], | ||
} | ||
) | ||
df["Y"] = df["Y"].astype("float32") | ||
|
||
gb = df.groupby("X", observed=False) | ||
result = gb.transform("min") | ||
|
||
expected = DataFrame( | ||
{ | ||
"Y": [ | ||
0.3940429, | ||
940645.49, | ||
np.finfo(np.float32).min, | ||
0.3940429, | ||
940645.49, | ||
np.finfo(np.float32).min, | ||
] | ||
}, | ||
dtype="float32", | ||
) | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
# GH#58084 | ||
def test_min_all_empty_data_no_type_coercion(): | ||
df = DataFrame( | ||
{ | ||
"X": Categorical( | ||
[], | ||
categories=[ | ||
1, | ||
"randomcat", | ||
100, | ||
333, | ||
"cat43543", | ||
-4325466, | ||
54665, | ||
-546767, | ||
"432945", | ||
767076, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think there is a need for so many here - can you make it 1-3 categories (so the test is more compact). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. resolved |
||
], | ||
), | ||
"Y": [], | ||
} | ||
) | ||
df["Y"] = df["Y"].astype("int32") | ||
|
||
gb = df.groupby("X", observed=False) | ||
result = gb.transform("min") | ||
|
||
expected = DataFrame({"Y": []}, dtype="int32") | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
# GH#58084 | ||
def test_min_one_dim_no_type_coercion(): | ||
df = DataFrame({"Y": [9435, -5465765, 5055, 0, 954960]}) | ||
df["Y"] = df["Y"].astype("int32") | ||
categories = Categorical([1, 2, 2, 5, 1], categories=[1, 2, 3, 4, 5]) | ||
|
||
gb = df.groupby(categories, observed=False) | ||
result = gb.transform("min") | ||
|
||
expected = DataFrame({"Y": [9435, -5465765, -5465765, 0, 9435]}, dtype="int32") | ||
tm.assert_frame_equal(expected, result) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you revert this line addition
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This still appears in the diff of this PR.