Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: GroupBy return EA dtype #23318

Merged
merged 15 commits into from Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
17 changes: 15 additions & 2 deletions pandas/core/groupby/groupby.py
Expand Up @@ -27,7 +27,9 @@ class providing the base-class of operations.
)
from pandas.core.config import option_context
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import ensure_float, is_numeric_dtype, is_scalar
from pandas.core.dtypes.common import (
ensure_float, is_numeric_dtype, is_scalar, is_extension_array_dtype
)
from pandas.core.dtypes.missing import isna, notna
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
Expand Down Expand Up @@ -753,7 +755,18 @@ def _try_cast(self, result, obj, numeric_only=False):
dtype = obj.dtype

if not is_scalar(result):
if numeric_only and is_numeric_dtype(dtype) or not numeric_only:
if is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.
try:
result = obj.values._from_sequence(result)
except Exception:
# https://github.com/pandas-dev/pandas/issues/22850
# pandas has no control over what 3rd-party ExtensionArrays
# do in _values_from_sequence. We still want ops to work
# though, so we catch any regular Exception.
pass
elif numeric_only and is_numeric_dtype(dtype) or not numeric_only:
result = maybe_downcast_to_dtype(result, dtype)

return result
Expand Down
30 changes: 20 additions & 10 deletions pandas/tests/arrays/test_integer.py
Expand Up @@ -652,11 +652,9 @@ def test_preserve_dtypes(op):

# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fully construct the return frame as in the existing example

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean this
a)

result = df.groupby("A").op()
assert result.dtypes['C'].name == 'Int64'

b) or do you mean that I should construct the expected as:

expected = pd.DataFrame({	
        "B": np.array([1.0, 3.0]),
        "C": np.array([1, 3], dtype="Int64")
    }, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)

Because in b) the expected constructed dataframe converts Int64 to int64.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because in b) the expected constructed dataframe converts Int64 to int64

The "Int64" (capital "I") is a pandas thing. You need

"C": integer_array([1, 3], dtype="Int64")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the 2nd. we want to construct the resulting frame and compare

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was this marked resolved? Am I missing the expected and the tm.assert_frame_equal?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you are correct, I missed that one...


assert result.dtypes['B'].name == 'float64'
assert result.dtypes['C'].name == 'Int64'


@pytest.mark.parametrize('op', ['mean'])
Expand All @@ -675,11 +673,23 @@ def test_reduce_to_float(op):

# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="float64")
5hirish marked this conversation as resolved.
Show resolved Hide resolved
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)

assert result.dtypes['B'].name == 'float64'
assert result.dtypes['C'].name == 'Int64'


@pytest.mark.parametrize('op', ['sum'])
5hirish marked this conversation as resolved.
Show resolved Hide resolved
def test_groupby_extension_array(op):
# GH23227
# groupby on an extension array should return the extension array type
df = pd.DataFrame({
'Int': pd.Series([1, 2, 3], dtype='Int64'),
'A': [1, 2, 1]
})

result = getattr(df.groupby('A').Int, op)()
assert result is not None
assert result.dtype.name == 'Int64'


def test_astype_nansafe():
Expand Down
14 changes: 7 additions & 7 deletions pandas/tests/sparse/test_groupby.py
Expand Up @@ -26,25 +26,25 @@ def test_first_last_nth(self):

# TODO: shouldn't these all be spares or not?
tm.assert_frame_equal(sparse_grouped.first(),
dense_grouped.first())
dense_grouped.first().to_sparse())
tm.assert_frame_equal(sparse_grouped.last(),
dense_grouped.last())
dense_grouped.last().to_sparse())
tm.assert_frame_equal(sparse_grouped.nth(1),
dense_grouped.nth(1).to_sparse())

def test_aggfuncs(self):
sparse_grouped = self.sparse.groupby('A')
dense_grouped = self.dense.groupby('A')

5hirish marked this conversation as resolved.
Show resolved Hide resolved
tm.assert_frame_equal(sparse_grouped.mean(),
dense_grouped.mean())
tm.assert_frame_equal(sparse_grouped.mean().to_sparse(),
dense_grouped.mean().to_sparse())

# ToDo: sparse sum includes str column
# tm.assert_frame_equal(sparse_grouped.sum(),
# dense_grouped.sum())

tm.assert_frame_equal(sparse_grouped.count(),
dense_grouped.count())
tm.assert_frame_equal(sparse_grouped.count().to_sparse(),
dense_grouped.count().to_sparse())


@pytest.mark.parametrize("fill_value", [0, np.nan])
Expand All @@ -55,5 +55,5 @@ def test_groupby_includes_fill_value(fill_value):
sdf = df.to_sparse(fill_value=fill_value)
result = sdf.groupby('a').sum()
expected = df.groupby('a').sum()
tm.assert_frame_equal(result, expected,
tm.assert_frame_equal(result, expected.to_sparse(fill_value=fill_value),
5hirish marked this conversation as resolved.
Show resolved Hide resolved
check_index_type=False)
1 change: 1 addition & 0 deletions pandas/tests/test_resample.py
Expand Up @@ -1576,6 +1576,7 @@ def test_resample_categorical_data_with_timedeltaindex(self):
'Group': ['A', 'A']},
index=pd.to_timedelta([0, 10], unit='s'))
expected = expected.reindex(['Group_obj', 'Group'], axis=1)
expected['Group'] = expected['Group_obj'].astype('category')
5hirish marked this conversation as resolved.
Show resolved Hide resolved
tm.assert_frame_equal(result, expected)

def test_resample_daily_anchored(self):
Expand Down