Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] Sum of grouped bool has inconsistent dtype #32894

Merged
merged 7 commits into from
Mar 26, 2020
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,8 @@ Groupby/resample/rolling

- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`)
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`)


Reshaping
^^^^^^^^^
Expand Down
2 changes: 0 additions & 2 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
ExtensionArray,
ExtensionOpsMixin,
ExtensionScalarOpsMixin,
try_cast_to_ea,
)
from pandas.core.arrays.boolean import BooleanArray
from pandas.core.arrays.categorical import Categorical
Expand All @@ -19,7 +18,6 @@
"ExtensionArray",
"ExtensionOpsMixin",
"ExtensionScalarOpsMixin",
"try_cast_to_ea",
"BooleanArray",
"Categorical",
"DatetimeArray",
Expand Down
26 changes: 2 additions & 24 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pandas.util._decorators import Appender, Substitution
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import maybe_cast_to_extension_array
from pandas.core.dtypes.common import is_array_like, is_list_like
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
Expand All @@ -32,29 +33,6 @@
_extension_array_shared_docs: Dict[str, str] = dict()


def try_cast_to_ea(cls_or_instance, obj, dtype=None):
"""
Call to `_from_sequence` that returns the object unchanged on Exception.

Parameters
----------
cls_or_instance : ExtensionArray subclass or instance
obj : arraylike
Values to pass to cls._from_sequence
dtype : ExtensionDtype, optional

Returns
-------
ExtensionArray or obj
"""
try:
result = cls_or_instance._from_sequence(obj, dtype=dtype)
except Exception:
# We can't predict what downstream EA constructors may raise
result = obj
return result


class ExtensionArray:
"""
Abstract base class for custom 1-D array types.
Expand Down Expand Up @@ -1214,7 +1192,7 @@ def _maybe_convert(arr):
# https://github.com/pandas-dev/pandas/issues/22850
# We catch all regular exceptions here, and fall back
# to an ndarray.
res = try_cast_to_ea(self, arr)
res = maybe_cast_to_extension_array(type(self), arr)
if not isinstance(res, type(self)):
# exception raised in _from_sequence; ensure we have ndarray
res = np.asarray(arr)
Expand Down
14 changes: 7 additions & 7 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
)
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs

from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike
from pandas.core.dtypes.cast import (
coerce_indexer_dtype,
maybe_cast_to_extension_array,
maybe_infer_to_datetimelike,
)
from pandas.core.dtypes.common import (
ensure_int64,
ensure_object,
Expand Down Expand Up @@ -47,11 +51,7 @@
from pandas.core.accessor import PandasDelegate, delegate_names
import pandas.core.algorithms as algorithms
from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
from pandas.core.arrays.base import (
ExtensionArray,
_extension_array_shared_docs,
try_cast_to_ea,
)
from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs
from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
import pandas.core.common as com
from pandas.core.construction import array, extract_array, sanitize_array
Expand Down Expand Up @@ -2568,7 +2568,7 @@ def _get_codes_for_values(values, categories):
# scalar objects. e.g.
# Categorical(array[Period, Period], categories=PeriodIndex(...))
cls = categories.dtype.construct_array_type()
values = try_cast_to_ea(cls, values)
values = maybe_cast_to_extension_array(cls, values)
if not isinstance(values, cls):
# exception raised in _from_sequence
values = ensure_object(values)
Expand Down
93 changes: 92 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
iNaT,
)
from pandas._libs.tslibs.timezones import tz_compare
from pandas._typing import Dtype
from pandas._typing import Dtype, DtypeObj
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -246,6 +246,97 @@ def trans(x):
return result


def maybe_cast_result(
result, obj: ABCSeries, numeric_only: bool = False, how: str = ""
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
):
"""
Try casting result to a different type if appropriate

Parameters
----------
result : array-like
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
Result to cast.
obj : ABCSeries
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
Input series from which result was calculated.
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
numeric_only : bool, default False
Whether to cast only numerics or datetimes as well.
how : str, default ""
How the result was computed.

Returns
-------
result : array-like
result maybe casted to the dtype.
"""
if obj.ndim > 1:
dtype = obj._values.dtype
else:
dtype = obj.dtype
dtype = maybe_cast_result_dtype(dtype, how)

if not is_scalar(result):
if is_extension_array_dtype(dtype) and dtype.kind != "M":
# The result may be of any type, cast back to original
# type if it's compatible.
if len(result) and isinstance(result[0], dtype.type):
cls = dtype.construct_array_type()
result = maybe_cast_to_extension_array(cls, result, dtype=dtype)

elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result


def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
"""
Get the desired dtype of a result based on the
input dtype and how it was computed.

Parameters
----------
dtype : DtypeObj
Input dtype.
how : str
How the result was computed.

Returns
-------
DtypeObj
The desired dtype of the result.
"""
d = {
(np.dtype(np.bool), "add"): np.dtype(np.int64),
(np.dtype(np.bool), "cumsum"): np.dtype(np.int64),
(np.dtype(np.bool), "sum"): np.dtype(np.int64),
}
return d.get((dtype, how), dtype)


def maybe_cast_to_extension_array(cls, obj, dtype=None):
"""
Call to `_from_sequence` that returns the object unchanged on Exception.

Parameters
----------
cls : ExtensionArray subclass
obj : arraylike
Values to pass to cls._from_sequence
dtype : ExtensionDtype, optional

Returns
-------
ExtensionArray or obj
"""
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
assert isinstance(cls, type), f"must pass a type: {cls}"
try:
result = cls._from_sequence(obj, dtype=dtype)
except Exception:
# We can't predict what downstream EA constructors may raise
result = obj
return result


def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other):
"""
A safe version of putmask that potentially upcasts the result.
Expand Down
14 changes: 9 additions & 5 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
from pandas.util._decorators import Appender, Substitution

from pandas.core.dtypes.cast import (
maybe_cast_result,
maybe_cast_result_dtype,
maybe_convert_objects,
maybe_downcast_numeric,
maybe_downcast_to_dtype,
Expand Down Expand Up @@ -526,7 +528,7 @@ def _transform_fast(self, result, func_nm: str) -> Series:
cast = self._transform_should_cast(func_nm)
out = algorithms.take_1d(result._values, ids)
if cast:
out = self._try_cast(out, self.obj)
out = maybe_cast_result(out, self.obj, how=func_nm)
return Series(out, index=self.obj.index, name=self.obj.name)

def filter(self, func, dropna=True, *args, **kwargs):
Expand Down Expand Up @@ -1072,8 +1074,10 @@ def _cython_agg_blocks(
assert not isinstance(result, DataFrame)

if result is not no_result:
# see if we can cast the block back to the original dtype
result = maybe_downcast_numeric(result, block.dtype)
# see if we can cast the block to the desired dtype
# this may not be the original dtype
dtype = maybe_cast_result_dtype(block.dtype, how)
result = maybe_downcast_numeric(result, dtype)

if block.is_extension and isinstance(result, np.ndarray):
# e.g. block.values was an IntegerArray
Expand Down Expand Up @@ -1175,7 +1179,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:

else:
if cast:
result[item] = self._try_cast(result[item], data)
result[item] = maybe_cast_result(result[item], data)

result_columns = obj.columns
if cannot_agg:
Expand Down Expand Up @@ -1460,7 +1464,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame:
# TODO: we have no test cases that get here with EA dtypes;
# try_cast may not be needed if EAs never get here
if cast:
res = self._try_cast(res, obj.iloc[:, i])
res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm)
output.append(res)

return DataFrame._from_arrays(output, columns=result.columns, index=obj.index)
Expand Down
45 changes: 7 additions & 38 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,10 @@ class providing the base-class of operations.
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, cache_readonly

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.cast import maybe_cast_result
from pandas.core.dtypes.common import (
ensure_float,
is_datetime64_dtype,
is_extension_array_dtype,
is_integer_dtype,
is_numeric_dtype,
is_object_dtype,
Expand All @@ -53,7 +52,7 @@ class providing the base-class of operations.

from pandas.core import nanops
import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea
from pandas.core.arrays import Categorical, DatetimeArray
from pandas.core.base import DataError, PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -792,36 +791,6 @@ def _cumcount_array(self, ascending: bool = True):
rev[sorter] = np.arange(count, dtype=np.intp)
return out[rev].astype(np.int64, copy=False)

def _try_cast(self, result, obj, numeric_only: bool = False):
"""
Try to cast the result to our obj original type,
we may have roundtripped through object in the mean-time.

If numeric_only is True, then only try to cast numerics
and not datetimelikes.

"""
if obj.ndim > 1:
dtype = obj._values.dtype
else:
dtype = obj.dtype

if not is_scalar(result):
if is_extension_array_dtype(dtype) and dtype.kind != "M":
# The function can return something of any type, so check
# if the type is compatible with the calling EA.
# datetime64tz is handled correctly in agg_series,
# so is excluded here.

if len(result) and isinstance(result[0], dtype.type):
cls = dtype.construct_array_type()
result = try_cast_to_ea(cls, result, dtype=dtype)

elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result

def _transform_should_cast(self, func_nm: str) -> bool:
"""
Parameters
Expand Down Expand Up @@ -852,7 +821,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
continue

if self._transform_should_cast(how):
result = self._try_cast(result, obj)
result = maybe_cast_result(result, obj, how=how)

key = base.OutputKey(label=name, position=idx)
output[key] = result
Expand Down Expand Up @@ -895,12 +864,12 @@ def _cython_agg_general(
assert len(agg_names) == result.shape[1]
for result_column, result_name in zip(result.T, agg_names):
key = base.OutputKey(label=result_name, position=idx)
output[key] = self._try_cast(result_column, obj)
output[key] = maybe_cast_result(result_column, obj, how=how)
idx += 1
else:
assert result.ndim == 1
key = base.OutputKey(label=name, position=idx)
output[key] = self._try_cast(result, obj)
output[key] = maybe_cast_result(result, obj, how=how)
idx += 1

if len(output) == 0:
Expand Down Expand Up @@ -929,7 +898,7 @@ def _python_agg_general(self, func, *args, **kwargs):

assert result is not None
key = base.OutputKey(label=name, position=idx)
output[key] = self._try_cast(result, obj, numeric_only=True)
output[key] = maybe_cast_result(result, obj, numeric_only=True)

if len(output) == 0:
return self._python_apply_general(f)
Expand All @@ -944,7 +913,7 @@ def _python_agg_general(self, func, *args, **kwargs):
if is_numeric_dtype(values.dtype):
values = ensure_float(values)

output[key] = self._try_cast(values[mask], result)
output[key] = maybe_cast_result(values[mask], result)

return self._wrap_aggregated_output(output)

Expand Down
10 changes: 7 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@
from pandas.util._decorators import Appender, Substitution, doc
from pandas.util._validators import validate_bool_kwarg, validate_percentile

from pandas.core.dtypes.cast import convert_dtypes, validate_numeric_casting
from pandas.core.dtypes.cast import (
convert_dtypes,
maybe_cast_to_extension_array,
validate_numeric_casting,
)
from pandas.core.dtypes.common import (
_is_unorderable_exception,
ensure_platform_int,
Expand Down Expand Up @@ -59,7 +63,7 @@
import pandas as pd
from pandas.core import algorithms, base, generic, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import ExtensionArray, try_cast_to_ea
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.categorical import CategoricalAccessor
from pandas.core.arrays.sparse import SparseAccessor
import pandas.core.common as com
Expand Down Expand Up @@ -2721,7 +2725,7 @@ def combine(self, other, func, fill_value=None) -> "Series":
# TODO: can we do this for only SparseDtype?
# The function can return something of any type, so check
# if the type is compatible with the calling EA.
new_values = try_cast_to_ea(self._values, new_values)
new_values = maybe_cast_to_extension_array(type(self._values), new_values)
return self._constructor(new_values, index=new_index, name=new_name)

def combine_first(self, other) -> "Series":
Expand Down
Loading