BUG: GroupBy return EA dtype #23318

Merged
merged 15 commits on Nov 6, 2018
Changes from all commits
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
@@ -850,6 +850,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`).
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)
- :meth:`Series.unstack` no longer converts extension arrays to object-dtype ndarrays. The output ``DataFrame`` will now have the same dtype as the input. This changes behavior for Categorical and Sparse data (:issue:`23077`).
- Bug in :meth:`DataFrame.groupby` where aggregating on an ``ExtensionArray`` column did not return the actual ``ExtensionArray`` dtype (:issue:`23227`).

.. _whatsnew_0240.api.incompatibilities:

@@ -1084,6 +1085,7 @@ Categorical
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
- Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`).
- Bug in :meth:`DataFrame.resample` where aggregating on categorical data lost the categorical dtype (:issue:`23227`).

Datetimelike
^^^^^^^^^^^^
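For reference, a minimal sketch of the behavior the groupby whatsnew entry above describes (assuming pandas 0.24+ with the nullable ``Int64`` extension dtype; the frame and column names are made up for illustration):

```python
import pandas as pd

# Grouping on "A" and aggregating the Int64-backed column "B": after this
# change the result keeps the extension dtype instead of falling back to a
# plain NumPy dtype.
df = pd.DataFrame({
    "A": ["a", "a", "b"],
    "B": pd.Series([1, 2, 3], dtype="Int64"),
})

result = df.groupby("A").sum()
print(result["B"].dtype)  # Int64
```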
16 changes: 14 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -24,7 +24,8 @@ class providing the base-class of operations.
from pandas.util._validators import validate_kwargs

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import ensure_float, is_numeric_dtype, is_scalar
from pandas.core.dtypes.common import (
ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
from pandas.core.dtypes.missing import isna, notna

import pandas.core.algorithms as algorithms
@@ -754,7 +755,18 @@ def _try_cast(self, result, obj, numeric_only=False):
dtype = obj.dtype

if not is_scalar(result):
if numeric_only and is_numeric_dtype(dtype) or not numeric_only:
if is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.
try:
result = obj.values._from_sequence(result)
except Exception:
# https://github.com/pandas-dev/pandas/issues/22850
# pandas has no control over what 3rd-party ExtensionArrays
# do in _from_sequence. We still want ops to work
# though, so we catch any regular Exception.
pass
elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result
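A standalone sketch (not the pandas internals verbatim) of the fallback strategy used in ``_try_cast`` above: try to rebuild an array of the original extension dtype from whatever the aggregation produced, and keep the raw result if the ExtensionArray refuses it. ``cast_back`` is a hypothetical helper for illustration:

```python
import pandas as pd
from pandas.api.types import is_extension_array_dtype

def cast_back(result, obj):
    """Try to restore obj's extension dtype on an aggregation result."""
    if is_extension_array_dtype(obj.dtype):
        try:
            # _from_sequence is the ExtensionArray constructor classmethod
            # pandas relies on here.
            return type(obj.array)._from_sequence(result)
        except Exception:
            # Third-party EAs may raise anything; keep the raw result so the
            # op still succeeds (see GH 22850).
            return result
    return result

ser = pd.Series([1, 2, 3], dtype="Int64")
restored = cast_back([1.0, 3.0], ser)
print(getattr(restored, "dtype", type(restored)))  # Int64 if the cast succeeded
```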
6 changes: 4 additions & 2 deletions pandas/tests/arrays/test_integer.py
@@ -650,9 +650,10 @@ def test_preserve_dtypes(op):

# groupby
result = getattr(df.groupby("A"), op)()

expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="int64")
"C": integer_array([1, 3], dtype="Int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)

@@ -673,9 +674,10 @@ def test_reduce_to_float(op):

# groupby
result = getattr(df.groupby("A"), op)()

expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="float64")
"C": integer_array([1, 3], dtype="Int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)

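For context on the expected frames above: ``integer_array`` is the construction helper the test module imports from ``pandas.core.arrays``; assuming pandas 0.24, it builds the same nullable ``IntegerArray`` that the public ``pd.array`` constructor produces for the ``"Int64"`` dtype:

```python
import pandas as pd
from pandas.core.arrays import integer_array

left = integer_array([1, 3], dtype="Int64")   # helper used in the tests
right = pd.array([1, 3], dtype="Int64")       # public constructor
print(left.dtype, right.dtype)  # Int64 Int64
```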
37 changes: 24 additions & 13 deletions pandas/tests/sparse/test_groupby.py
@@ -24,27 +24,39 @@ def test_first_last_nth(self):
sparse_grouped = self.sparse.groupby('A')
dense_grouped = self.dense.groupby('A')

sparse_grouped_first = sparse_grouped.first()
sparse_grouped_last = sparse_grouped.last()
sparse_grouped_nth = sparse_grouped.nth(1)

dense_grouped_first = dense_grouped.first().to_sparse()
dense_grouped_last = dense_grouped.last().to_sparse()
dense_grouped_nth = dense_grouped.nth(1).to_sparse()

# TODO: shouldn't these all be sparse or not?
tm.assert_frame_equal(sparse_grouped.first(),
dense_grouped.first())
tm.assert_frame_equal(sparse_grouped.last(),
dense_grouped.last())
tm.assert_frame_equal(sparse_grouped.nth(1),
dense_grouped.nth(1).to_sparse())
tm.assert_frame_equal(sparse_grouped_first,
dense_grouped_first)
tm.assert_frame_equal(sparse_grouped_last,
dense_grouped_last)
tm.assert_frame_equal(sparse_grouped_nth,
dense_grouped_nth)

def test_aggfuncs(self):
sparse_grouped = self.sparse.groupby('A')
dense_grouped = self.dense.groupby('A')

tm.assert_frame_equal(sparse_grouped.mean(),
dense_grouped.mean())
result = sparse_grouped.mean().to_sparse()
expected = dense_grouped.mean().to_sparse()

tm.assert_frame_equal(result, expected)

# ToDo: sparse sum includes str column
# tm.assert_frame_equal(sparse_grouped.sum(),
# dense_grouped.sum())

tm.assert_frame_equal(sparse_grouped.count(),
dense_grouped.count())
result = sparse_grouped.count().to_sparse()
expected = dense_grouped.count().to_sparse()

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("fill_value", [0, np.nan])
@@ -54,6 +66,5 @@ def test_groupby_includes_fill_value(fill_value):
'b': [fill_value, 1, fill_value, fill_value]})
sdf = df.to_sparse(fill_value=fill_value)
result = sdf.groupby('a').sum()
expected = df.groupby('a').sum()
tm.assert_frame_equal(result, expected,
check_index_type=False)
expected = df.groupby('a').sum().to_sparse(fill_value=fill_value)
tm.assert_frame_equal(result, expected, check_index_type=False)
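A hedged illustration of the sparse expectation above: after this change the grouped result stays sparse, so the dense expectation has to be converted back with the same ``fill_value``. ``DataFrame.to_sparse`` was still available in the pandas 0.24 line assumed here:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 1, 2, 2], 'b': [0, 1, 0, 0]})
sdf = df.to_sparse(fill_value=0)

result = sdf.groupby('a').sum()
expected = df.groupby('a').sum().to_sparse(fill_value=0)
# Both sides now carry sparse columns; comparing against a dense expectation
# would fail on dtype.
print(result.dtypes)
```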
1 change: 1 addition & 0 deletions pandas/tests/test_resample.py
@@ -1576,6 +1576,7 @@ def test_resample_categorical_data_with_timedeltaindex(self):
'Group': ['A', 'A']},
index=pd.to_timedelta([0, 10], unit='s'))
expected = expected.reindex(['Group_obj', 'Group'], axis=1)
expected['Group'] = expected['Group_obj'].astype('category')
tm.assert_frame_equal(result, expected)

def test_resample_daily_anchored(self):
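And a hedged sketch of the resample expectation above: after this change, aggregating a categorical column keeps the ``category`` dtype instead of falling back to object (pandas 0.24 assumed; ``first`` on a timedelta index is used here as a simple aggregation mirroring the test):

```python
import pandas as pd

df = pd.DataFrame({'Group': ['A', 'A', 'B', 'B']},
                  index=pd.to_timedelta([0, 5, 10, 15], unit='s'))
df['Group'] = df['Group'].astype('category')

result = df.resample('10s').first()
print(result['Group'].dtype)  # category
```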