Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add prod to masked_reductions #33442

Merged
merged 16 commits into from
Apr 12, 2020
49 changes: 31 additions & 18 deletions pandas/core/array_algos/masked_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
for missing values.
"""

from typing import Callable

import numpy as np

from pandas._libs import missing as libmissing
Expand All @@ -11,14 +13,19 @@
from pandas.core.nanops import check_below_min_count


def sum(
values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0,
def _sumprod(
func: Callable,
values: np.ndarray,
mask: np.ndarray,
skipna: bool = True,
min_count: int = 0,
):
"""
Sum for 1D masked array.
Sum or product for 1D masked array.

Parameters
----------
func : np.sum or np.prod
values : np.ndarray
Numpy array with the values (can be of any dtype that support the
operation).
Expand All @@ -31,23 +38,21 @@ def sum(
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
if mask.any():
if mask.any() or check_below_min_count(values.shape, None, min_count):
return libmissing.NA
else:
if check_below_min_count(values.shape, None, min_count):
return libmissing.NA
return np.sum(values)
return func(values)
else:
if check_below_min_count(values.shape, mask, min_count):
return libmissing.NA

if _np_version_under1p17:
return np.sum(values[~mask])
return func(values[~mask])
else:
return np.sum(values, where=~mask)
return func(values, where=~mask)


def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
"""
Reduction for 1D masked array.

Expand All @@ -63,17 +68,13 @@ def _minmax(func, values: np.ndarray, mask: np.ndarray, skipna: bool = True):
Whether to skip NA.
"""
if not skipna:
if mask.any():
if mask.any() or not values.size:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
else:
if values.size:
return func(values)
else:
# min/max with empty array raise in numpy, pandas returns NA
return libmissing.NA
return func(values)
else:
subset = values[~mask]
if subset.size:
if not mask.all():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing a mask.all() might on average be expensive here, since in most cases you will actually need the subset afterwards

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, I see what you're saying. I'll switch that back

return func(values[~mask])
else:
# min/max with empty array raise in numpy, pandas returns NA
Expand All @@ -86,3 +87,15 @@ def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True):

def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True):
return _minmax(np.max, values=values, mask=mask, skipna=skipna)


def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
dsaxton marked this conversation as resolved.
Show resolved Hide resolved
return _sumprod(
np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count
)


def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0):
return _sumprod(
np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count
)
8 changes: 1 addition & 7 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,7 +696,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -710,12 +710,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if np.isnan(result):
return libmissing.NA

# if we have numeric op that would result in an int, coerce to int if possible
if name == "prod" and notna(result):
int_result = np.int64(result)
if int_result == result:
result = int_result

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
8 changes: 1 addition & 7 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name in {"sum", "min", "max"}:
if name in {"sum", "prod", "min", "max"}:
op = getattr(masked_reductions, name)
return op(data, mask, skipna=skipna, **kwargs)

Expand All @@ -581,12 +581,6 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if name in ["any", "all"]:
pass

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name == "prod":
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

return result

def _maybe_mask_result(self, result, mask, other, op_name: str):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/arrays/integer/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
if op in {"sum", "min", "max"}:
if op in {"sum", "prod", "min", "max"}:
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)
Expand Down