pandas-dev · jreback · Mar 26, 2020 · Mar 21, 2020 · Mar 22, 2020 · Mar 24, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -370,6 +370,8 @@ Groupby/resample/rolling
 
 - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`)
 - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`)
+- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`)
+
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -2,7 +2,6 @@
     ExtensionArray,
     ExtensionOpsMixin,
     ExtensionScalarOpsMixin,
-    try_cast_to_ea,
 )
 from pandas.core.arrays.boolean import BooleanArray
 from pandas.core.arrays.categorical import Categorical
@@ -19,7 +18,6 @@
     "ExtensionArray",
     "ExtensionOpsMixin",
     "ExtensionScalarOpsMixin",
-    "try_cast_to_ea",
     "BooleanArray",
     "Categorical",
     "DatetimeArray",

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -19,6 +19,7 @@
 from pandas.util._decorators import Appender, Substitution
 from pandas.util._validators import validate_fillna_kwargs
 
+from pandas.core.dtypes.cast import maybe_cast_to_extension_array
 from pandas.core.dtypes.common import is_array_like, is_list_like
 from pandas.core.dtypes.dtypes import ExtensionDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
@@ -32,29 +33,6 @@
 _extension_array_shared_docs: Dict[str, str] = dict()
 
 
-def try_cast_to_ea(cls_or_instance, obj, dtype=None):
-    """
-    Call to `_from_sequence` that returns the object unchanged on Exception.
-
-    Parameters
-    ----------
-    cls_or_instance : ExtensionArray subclass or instance
-    obj : arraylike
-        Values to pass to cls._from_sequence
-    dtype : ExtensionDtype, optional
-
-    Returns
-    -------
-    ExtensionArray or obj
-    """
-    try:
-        result = cls_or_instance._from_sequence(obj, dtype=dtype)
-    except Exception:
-        # We can't predict what downstream EA constructors may raise
-        result = obj
-    return result
-
-
 class ExtensionArray:
     """
     Abstract base class for custom 1-D array types.
@@ -1214,7 +1192,7 @@ def _maybe_convert(arr):
                     # https://github.com/pandas-dev/pandas/issues/22850
                     # We catch all regular exceptions here, and fall back
                     # to an ndarray.
-                    res = try_cast_to_ea(self, arr)
+                    res = maybe_cast_to_extension_array(type(self), arr)
                     if not isinstance(res, type(self)):
                         # exception raised in _from_sequence; ensure we have ndarray
                         res = np.asarray(arr)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -19,7 +19,11 @@
 )
 from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
 
-from pandas.core.dtypes.cast import coerce_indexer_dtype, maybe_infer_to_datetimelike
+from pandas.core.dtypes.cast import (
+    coerce_indexer_dtype,
+    maybe_cast_to_extension_array,
+    maybe_infer_to_datetimelike,
+)
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_object,
@@ -47,11 +51,7 @@
 from pandas.core.accessor import PandasDelegate, delegate_names
 import pandas.core.algorithms as algorithms
 from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d
-from pandas.core.arrays.base import (
-    ExtensionArray,
-    _extension_array_shared_docs,
-    try_cast_to_ea,
-)
+from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs
 from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs
 import pandas.core.common as com
 from pandas.core.construction import array, extract_array, sanitize_array
@@ -2568,7 +2568,7 @@ def _get_codes_for_values(values, categories):
         # scalar objects. e.g.
         # Categorical(array[Period, Period], categories=PeriodIndex(...))
         cls = categories.dtype.construct_array_type()
-        values = try_cast_to_ea(cls, values)
+        values = maybe_cast_to_extension_array(cls, values)
         if not isinstance(values, cls):
             # exception raised in _from_sequence
             values = ensure_object(values)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -16,7 +16,7 @@
     iNaT,
 )
 from pandas._libs.tslibs.timezones import tz_compare
-from pandas._typing import Dtype
+from pandas._typing import Dtype, DtypeObj
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.common import (
@@ -246,6 +246,97 @@ def trans(x):
     return result
 
 
+def maybe_cast_result(
+    result, obj: ABCSeries, numeric_only: bool = False, how: str = ""
+):
+    """
+    Try casting result to a different type if appropriate
+
+    Parameters
+    ----------
+    result : array-like
+        Result to cast.
+    obj : ABCSeries
+        Input series from which result was calculated.
+    numeric_only : bool, default False
+        Whether to cast only numerics or datetimes as well.
+    how : str, default ""
+        How the result was computed.
+
+    Returns
+    -------
+    result : array-like
+        result maybe casted to the dtype.
+    """
+    if obj.ndim > 1:
+        dtype = obj._values.dtype
+    else:
+        dtype = obj.dtype
+    dtype = maybe_cast_result_dtype(dtype, how)
+
+    if not is_scalar(result):
+        if is_extension_array_dtype(dtype) and dtype.kind != "M":
+            # The result may be of any type, cast back to original
+            # type if it's compatible.
+            if len(result) and isinstance(result[0], dtype.type):
+                cls = dtype.construct_array_type()
+                result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
+
+        elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
+            result = maybe_downcast_to_dtype(result, dtype)
+
+    return result
+
+
+def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
+    """
+    Get the desired dtype of a result based on the
+    input dtype and how it was computed.
+
+    Parameters
+    ----------
+    dtype : DtypeObj
+        Input dtype.
+    how : str
+        How the result was computed.
+
+    Returns
+    -------
+    DtypeObj
+        The desired dtype of the result.
+    """
+    d = {
+        (np.dtype(np.bool), "add"): np.dtype(np.int64),
+        (np.dtype(np.bool), "cumsum"): np.dtype(np.int64),
+        (np.dtype(np.bool), "sum"): np.dtype(np.int64),
+    }
+    return d.get((dtype, how), dtype)
+
+
+def maybe_cast_to_extension_array(cls, obj, dtype=None):
+    """
+    Call to `_from_sequence` that returns the object unchanged on Exception.
+
+    Parameters
+    ----------
+    cls : ExtensionArray subclass
+    obj : arraylike
+        Values to pass to cls._from_sequence
+    dtype : ExtensionDtype, optional
+
+    Returns
+    -------
+    ExtensionArray or obj
+    """
+    assert isinstance(cls, type), f"must pass a type: {cls}"
+    try:
+        result = cls._from_sequence(obj, dtype=dtype)
+    except Exception:
+        # We can't predict what downstream EA constructors may raise
+        result = obj
+    return result
+
+
 def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other):
     """
     A safe version of putmask that potentially upcasts the result.

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -34,6 +34,8 @@
 from pandas.util._decorators import Appender, Substitution
 
 from pandas.core.dtypes.cast import (
+    maybe_cast_result,
+    maybe_cast_result_dtype,
     maybe_convert_objects,
     maybe_downcast_numeric,
     maybe_downcast_to_dtype,
@@ -526,7 +528,7 @@ def _transform_fast(self, result, func_nm: str) -> Series:
         cast = self._transform_should_cast(func_nm)
         out = algorithms.take_1d(result._values, ids)
         if cast:
-            out = self._try_cast(out, self.obj)
+            out = maybe_cast_result(out, self.obj, how=func_nm)
         return Series(out, index=self.obj.index, name=self.obj.name)
 
     def filter(self, func, dropna=True, *args, **kwargs):
@@ -1072,8 +1074,10 @@ def _cython_agg_blocks(
             assert not isinstance(result, DataFrame)
 
             if result is not no_result:
-                # see if we can cast the block back to the original dtype
-                result = maybe_downcast_numeric(result, block.dtype)
+                # see if we can cast the block to the desired dtype
+                # this may not be the original dtype
+                dtype = maybe_cast_result_dtype(block.dtype, how)
+                result = maybe_downcast_numeric(result, dtype)
 
                 if block.is_extension and isinstance(result, np.ndarray):
                     # e.g. block.values was an IntegerArray
@@ -1175,7 +1179,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
 
             else:
                 if cast:
-                    result[item] = self._try_cast(result[item], data)
+                    result[item] = maybe_cast_result(result[item], data)
 
         result_columns = obj.columns
         if cannot_agg:
@@ -1460,7 +1464,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame:
             # TODO: we have no test cases that get here with EA dtypes;
             #  try_cast may not be needed if EAs never get here
             if cast:
-                res = self._try_cast(res, obj.iloc[:, i])
+                res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm)
             output.append(res)
 
         return DataFrame._from_arrays(output, columns=result.columns, index=obj.index)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -39,11 +39,10 @@ class providing the base-class of operations.
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import Appender, Substitution, cache_readonly
 
-from pandas.core.dtypes.cast import maybe_downcast_to_dtype
+from pandas.core.dtypes.cast import maybe_cast_result
 from pandas.core.dtypes.common import (
     ensure_float,
     is_datetime64_dtype,
-    is_extension_array_dtype,
     is_integer_dtype,
     is_numeric_dtype,
     is_object_dtype,
@@ -53,7 +52,7 @@ class providing the base-class of operations.
 
 from pandas.core import nanops
 import pandas.core.algorithms as algorithms
-from pandas.core.arrays import Categorical, DatetimeArray, try_cast_to_ea
+from pandas.core.arrays import Categorical, DatetimeArray
 from pandas.core.base import DataError, PandasObject, SelectionMixin
 import pandas.core.common as com
 from pandas.core.frame import DataFrame
@@ -792,36 +791,6 @@ def _cumcount_array(self, ascending: bool = True):
         rev[sorter] = np.arange(count, dtype=np.intp)
         return out[rev].astype(np.int64, copy=False)
 
-    def _try_cast(self, result, obj, numeric_only: bool = False):
-        """
-        Try to cast the result to our obj original type,
-        we may have roundtripped through object in the mean-time.
-
-        If numeric_only is True, then only try to cast numerics
-        and not datetimelikes.
-
-        """
-        if obj.ndim > 1:
-            dtype = obj._values.dtype
-        else:
-            dtype = obj.dtype
-
-        if not is_scalar(result):
-            if is_extension_array_dtype(dtype) and dtype.kind != "M":
-                # The function can return something of any type, so check
-                #  if the type is compatible with the calling EA.
-                # datetime64tz is handled correctly in agg_series,
-                #  so is excluded here.
-
-                if len(result) and isinstance(result[0], dtype.type):
-                    cls = dtype.construct_array_type()
-                    result = try_cast_to_ea(cls, result, dtype=dtype)
-
-            elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
-                result = maybe_downcast_to_dtype(result, dtype)
-
-        return result
-
     def _transform_should_cast(self, func_nm: str) -> bool:
         """
         Parameters
@@ -852,7 +821,7 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs):
                 continue
 
             if self._transform_should_cast(how):
-                result = self._try_cast(result, obj)
+                result = maybe_cast_result(result, obj, how=how)
 
             key = base.OutputKey(label=name, position=idx)
             output[key] = result
@@ -895,12 +864,12 @@ def _cython_agg_general(
                 assert len(agg_names) == result.shape[1]
                 for result_column, result_name in zip(result.T, agg_names):
                     key = base.OutputKey(label=result_name, position=idx)
-                    output[key] = self._try_cast(result_column, obj)
+                    output[key] = maybe_cast_result(result_column, obj, how=how)
                     idx += 1
             else:
                 assert result.ndim == 1
                 key = base.OutputKey(label=name, position=idx)
-                output[key] = self._try_cast(result, obj)
+                output[key] = maybe_cast_result(result, obj, how=how)
                 idx += 1
 
         if len(output) == 0:
@@ -929,7 +898,7 @@ def _python_agg_general(self, func, *args, **kwargs):
 
             assert result is not None
             key = base.OutputKey(label=name, position=idx)
-            output[key] = self._try_cast(result, obj, numeric_only=True)
+            output[key] = maybe_cast_result(result, obj, numeric_only=True)
 
         if len(output) == 0:
             return self._python_apply_general(f)
@@ -944,7 +913,7 @@ def _python_agg_general(self, func, *args, **kwargs):
                 if is_numeric_dtype(values.dtype):
                     values = ensure_float(values)
 
-                output[key] = self._try_cast(values[mask], result)
+                output[key] = maybe_cast_result(values[mask], result)
 
         return self._wrap_aggregated_output(output)
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -27,7 +27,11 @@
 from pandas.util._decorators import Appender, Substitution, doc
 from pandas.util._validators import validate_bool_kwarg, validate_percentile
 
-from pandas.core.dtypes.cast import convert_dtypes, validate_numeric_casting
+from pandas.core.dtypes.cast import (
+    convert_dtypes,
+    maybe_cast_to_extension_array,
+    validate_numeric_casting,
+)
 from pandas.core.dtypes.common import (
     _is_unorderable_exception,
     ensure_platform_int,
@@ -59,7 +63,7 @@
 import pandas as pd
 from pandas.core import algorithms, base, generic, nanops, ops
 from pandas.core.accessor import CachedAccessor
-from pandas.core.arrays import ExtensionArray, try_cast_to_ea
+from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.categorical import CategoricalAccessor
 from pandas.core.arrays.sparse import SparseAccessor
 import pandas.core.common as com
@@ -2721,7 +2725,7 @@ def combine(self, other, func, fill_value=None) -> "Series":
             # TODO: can we do this for only SparseDtype?
             # The function can return something of any type, so check
             # if the type is compatible with the calling EA.
-            new_values = try_cast_to_ea(self._values, new_values)
+            new_values = maybe_cast_to_extension_array(type(self._values), new_values)
         return self._constructor(new_values, index=new_index, name=new_name)
 
     def combine_first(self, other) -> "Series":