Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add groupby & reduce support to EA #22762

Merged
merged 23 commits into from
Oct 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Pandas has gained the ability to hold integer dtypes with missing values. This l
Here is an example of the usage.

We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`)
marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`)

.. ipython:: python

Expand Down Expand Up @@ -91,6 +91,13 @@ These dtypes can be merged & reshaped & casted.
pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
df['A'].astype(float)

Reduction and groupby operations such as 'sum' work.
jreback marked this conversation as resolved.
Show resolved Hide resolved

.. ipython:: python

df.sum()
df.groupby('B').A.sum()

.. warning::

The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
Expand Down Expand Up @@ -550,6 +557,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)
- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`)
- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`)
- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`)

.. _whatsnew_0240.api.incompatibilities:

Expand Down
24 changes: 24 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,30 @@ def all_arithmetic_operators(request):
return request.param


# Names of the numeric reductions the fixture below is parametrized over.
_all_numeric_reductions = [
    'sum', 'max', 'min',
    'mean', 'prod', 'std', 'var', 'median',
    'kurt', 'skew',
]


@pytest.fixture(params=_all_numeric_reductions)
def all_numeric_reductions(request):
    """
    Fixture parametrized over the numeric reduction names; returns the
    name selected for the current parametrization.
    """
    reduction_name = request.param
    return reduction_name


# Names of the boolean reductions the fixture below is parametrized over.
_all_boolean_reductions = [
    'all',
    'any',
]


@pytest.fixture(params=_all_boolean_reductions)
def all_boolean_reductions(request):
    """
    Fixture parametrized over the boolean reduction names; returns the
    name selected for the current parametrization.
    """
    reduction_name = request.param
    return reduction_name


_cython_table = pd.core.base.SelectionMixin._cython_table.items()


Expand Down
31 changes: 31 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ class ExtensionArray(object):
as they only compose abstract methods. Still, a more efficient
implementation may be available, and these methods can be overridden.

One can implement methods to handle array reductions.

* _reduce

This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
Expand Down Expand Up @@ -675,6 +679,33 @@ def _ndarray_values(self):
"""
return np.array(self)

def _reduce(self, name, skipna=True, **kwargs):
    """
    Return a scalar result of performing the reduction operation.

    This base implementation performs no reduction: it always raises
    ``TypeError``. Subclasses that support reductions override it.

    Parameters
    ----------
    name : str
        Name of the function, supported values are:
        { any, all, min, max, sum, mean, median, prod,
        std, var, sem, kurt, skew }.
    skipna : bool, default True
        If True, skip NaN values.
    **kwargs
        Additional keyword arguments passed to the reduction function.
        Currently, `ddof` is the only supported kwarg.

    Returns
    -------
    scalar

    Raises
    ------
    TypeError : subclass does not define reductions
    """
    # The message names both the requested operation and the array's dtype.
    message = "cannot perform {name} with type {dtype}".format(
        name=name, dtype=self.dtype)
    raise TypeError(message)


class ExtensionOpsMixin(object):
"""
Expand Down
6 changes: 2 additions & 4 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2069,14 +2069,12 @@ def _reverse_indexer(self):
return result

# reduction ops #
def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
filter_type=None, **kwds):
""" perform the reduction type operation """
def _reduce(self, name, axis=0, skipna=True, **kwargs):
func = getattr(self, name, None)
if func is None:
msg = 'Categorical cannot perform the operation {op}'
raise TypeError(msg.format(op=name))
return func(numeric_only=numeric_only, **kwds)
return func(**kwargs)
jreback marked this conversation as resolved.
Show resolved Hide resolved

def min(self, numeric_only=None, **kwargs):
""" The minimum value of the object.
Expand Down
26 changes: 26 additions & 0 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pandas.compat import u, range, string_types
from pandas.compat import set_function_name

from pandas.core import nanops
from pandas.core.dtypes.cast import astype_nansafe
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -529,6 +530,31 @@ def cmp_method(self, other):
name = '__{name}__'.format(name=op.__name__)
return set_function_name(cmp_method, name, cls)

def _reduce(self, name, skipna=True, **kwargs):
    """
    Reduce the array to a scalar using the matching ``nanops`` function.

    Parameters
    ----------
    name : str
        Name of the reduction; the function ``nanops.nan<name>`` is
        looked up and called.
    skipna : bool, default True
        Forwarded to the nanops function.
    **kwargs
        Additional keyword arguments (e.g. ``ddof`` for ``std``/``var``)
        forwarded to the nanops function.

    Returns
    -------
    scalar
        For 'sum', 'min', 'max' and 'prod', a non-missing result that
        equals its integer conversion is returned as a Python int.
        Every other result is returned as produced by nanops.
    """
    data = self._data
    mask = self._mask

    # coerce to a nan-aware float if needed
    if mask.any():
        data = self._data.astype('float64')
        data[mask] = self._na_value

    op = getattr(nanops, 'nan' + name)
    # forward the extra keyword arguments; otherwise options such as
    # ``ddof`` would be accepted by this method but silently ignored
    result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

    # boolean ops ('any', 'all') and the remaining ops are returned as-is;
    # only a preservable numeric op is coerced back to an integer, and
    # only when that conversion is lossless
    if name in ['sum', 'min', 'max', 'prod'] and notna(result):
        int_result = int(result)
        if int_result == result:
            result = int_result

    return result

def _maybe_mask_result(self, result, mask, other, op_name):
"""
Parameters
Expand Down
17 changes: 13 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3392,16 +3392,25 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,

"""
delegate = self._values
if isinstance(delegate, np.ndarray):
# Validate that 'axis' is consistent with Series's single axis.
if axis is not None:
self._get_axis_number(axis)

if axis is not None:
self._get_axis_number(axis)

# dispatch to ExtensionArray interface
if isinstance(delegate, ExtensionArray):
return delegate._reduce(name, skipna=skipna, **kwds)

# dispatch to numpy arrays
elif isinstance(delegate, np.ndarray):
jreback marked this conversation as resolved.
Show resolved Hide resolved
if numeric_only:
raise NotImplementedError('Series.{0} does not implement '
'numeric_only.'.format(name))
with np.errstate(all='ignore'):
return op(delegate, skipna=skipna, **kwds)

# TODO(EA) dispatch to Index
# remove once all internals extension types are
# moved to ExtensionArrays
return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna,
numeric_only=numeric_only,
filter_type=filter_type, **kwds)
Expand Down
45 changes: 41 additions & 4 deletions pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,13 @@ def _check_op(self, s, op_name, other, exc=None):
# compute expected
mask = s.isna()

# if s is a DataFrame, squeeze to a Series
# for comparison
if isinstance(s, pd.DataFrame):
result = result.squeeze()
s = s.squeeze()
mask = mask.squeeze()

# other array is an Integer
if isinstance(other, IntegerArray):
omask = getattr(other, 'mask', None)
Expand Down Expand Up @@ -215,7 +222,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
s = pd.Series(data)
self._check_op(s, op, 1, exc=TypeError)

@pytest.mark.xfail(run=False, reason="_reduce needs implementation")
def test_arith_frame_with_scalar(self, data, all_arithmetic_operators):
# frame & scalar
op = all_arithmetic_operators
Expand Down Expand Up @@ -587,22 +593,53 @@ def test_cross_type_arithmetic():
tm.assert_series_equal(result, expected)


def test_groupby_mean_included():
@pytest.mark.parametrize('op', ['sum', 'min', 'max', 'prod'])
def test_preserve_dtypes(op):
# TODO(#22346): preserve Int64 dtype
# for ops that enable (mean would actually work here
# but generally it is a float return value)
df = pd.DataFrame({
"A": ['a', 'b', 'b'],
"B": [1, None, 3],
"C": integer_array([1, None, 3], dtype='Int64'),
})

result = df.groupby("A").sum()
# TODO(#22346): preserve Int64 dtype
# op
result = getattr(df.C, op)()
assert isinstance(result, int)
jreback marked this conversation as resolved.
Show resolved Hide resolved

# groupby
result = getattr(df.groupby("A"), op)()
expected = pd.DataFrame({
"B": np.array([1.0, 3.0]),
"C": np.array([1, 3], dtype="int64")
}, index=pd.Index(['a', 'b'], name='A'))
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize('op', ['mean'])
def test_reduce_to_float(op):
    # some reduce ops always return float, even if the result
    # is a rounded number
    frame = pd.DataFrame({
        "A": ['a', 'b', 'b'],
        "B": [1, None, 3],
        "C": integer_array([1, None, 3], dtype='Int64'),
    })

    # reduce the extension-backed column directly
    scalar = getattr(frame.C, op)()
    assert isinstance(scalar, float)

    # reduce through groupby and compare against plain float columns
    grouped = getattr(frame.groupby("A"), op)()
    group_index = pd.Index(['a', 'b'], name='A')
    expected = pd.DataFrame({
        "B": np.array([1.0, 3.0]),
        "C": np.array([1, 3], dtype="float64")
    }, index=group_index)
    tm.assert_frame_equal(grouped, expected)


def test_astype_nansafe():
# https://github.com/pandas-dev/pandas/pull/22343
arr = integer_array([np.nan, 1, 2], dtype="Int8")
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/dtypes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,8 @@ def test_is_datetime_or_timedelta_dtype():
assert not com.is_datetime_or_timedelta_dtype(str)
assert not com.is_datetime_or_timedelta_dtype(pd.Series([1, 2]))
assert not com.is_datetime_or_timedelta_dtype(np.array(['a', 'b']))
assert not com.is_datetime_or_timedelta_dtype(
DatetimeTZDtype("ns", "US/Eastern"))

assert com.is_datetime_or_timedelta_dtype(np.datetime64)
assert com.is_datetime_or_timedelta_dtype(np.timedelta64)
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/arrow/test_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ def test_from_dtype(self, data):
pytest.skip("GH-22666")


class TestReduce(base.BaseNoReduceTests):
    """Run the checks inherited from ``base.BaseNoReduceTests`` unchanged."""


def test_is_bool_dtype(data):
assert pd.api.types.is_bool_dtype(data)
assert pd.core.common.is_bool_indexer(data)
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/extension/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class TestMyDtype(BaseDtypeTests):
from .interface import BaseInterfaceTests # noqa
from .methods import BaseMethodsTests # noqa
from .ops import BaseArithmeticOpsTests, BaseComparisonOpsTests, BaseOpsUtil # noqa
from .reduce import BaseNoReduceTests, BaseNumericReduceTests, BaseBooleanReduceTests # noqa
from .missing import BaseMissingTests # noqa
from .reshaping import BaseReshapingTests # noqa
from .setitem import BaseSetitemTests # noqa
8 changes: 4 additions & 4 deletions pandas/tests/extension/base/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping):
"B": data_for_grouping})
result = df.groupby("B", as_index=as_index).A.mean()
_, index = pd.factorize(data_for_grouping, sort=True)
# TODO(ExtensionIndex): remove astype
index = pd.Index(index.astype(object), name="B")

index = pd.Index(index, name="B")
expected = pd.Series([3, 1, 4], index=index, name="A")
if as_index:
self.assert_series_equal(result, expected)
Expand All @@ -39,8 +39,8 @@ def test_groupby_extension_no_sort(self, data_for_grouping):
"B": data_for_grouping})
result = df.groupby("B", sort=False).A.mean()
_, index = pd.factorize(data_for_grouping, sort=False)
# TODO(ExtensionIndex): remove astype
index = pd.Index(index.astype(object), name="B")

index = pd.Index(index, name="B")
expected = pd.Series([1, 3, 4], index=index, name="A")
self.assert_series_equal(result, expected)

Expand Down
58 changes: 58 additions & 0 deletions pandas/tests/extension/base/reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import warnings
import pytest
import pandas.util.testing as tm
import pandas as pd
from .base import BaseExtensionTests


class BaseReduceTests(BaseExtensionTests):
    """
    Reduction specific tests. Generally these only
    make sense for numeric/boolean operations.
    """
    def check_reduce(self, s, op_name, skipna):
        """
        Assert that reducing ``s`` with ``op_name`` is almost equal to the
        same reduction applied to ``s`` cast to float64.
        """
        actual = getattr(s, op_name)(skipna=skipna)
        as_float = s.astype('float64')
        reference = getattr(as_float, op_name)(skipna=skipna)
        tm.assert_almost_equal(actual, reference)


class BaseNoReduceTests(BaseReduceTests):
    """ we don't define any reductions """

    def _assert_reduction_raises(self, data, op_name, skipna):
        # Shared body of both tests: calling the reduction on a Series
        # built from ``data`` must raise TypeError.
        series = pd.Series(data)

        with pytest.raises(TypeError):
            getattr(series, op_name)(skipna=skipna)

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna):
        self._assert_reduction_raises(data, all_numeric_reductions, skipna)

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
        self._assert_reduction_raises(data, all_boolean_reductions, skipna)


class BaseNumericReduceTests(BaseReduceTests):
    """
    Checks each numeric reduction on a Series built from ``data`` by
    delegating to ``check_reduce``.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series(self, data, all_numeric_reductions, skipna):
        op_name = all_numeric_reductions
        s = pd.Series(data)

        # min/max with empty produce numpy warnings
        with warnings.catch_warnings():
            # RuntimeWarning is silenced only inside this block
            warnings.simplefilter("ignore", RuntimeWarning)
            self.check_reduce(s, op_name, skipna)


class BaseBooleanReduceTests(BaseReduceTests):
    """
    Checks each boolean reduction on a Series built from ``data`` by
    delegating to ``check_reduce``.
    """

    @pytest.mark.parametrize('skipna', [True, False])
    def test_reduce_series(self, data, all_boolean_reductions, skipna):
        series = pd.Series(data)
        self.check_reduce(series, all_boolean_reductions, skipna)
Loading