Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: implement reductions for DatetimeArray/TimedeltaArray/PeriodArray #23890

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds
from pandas._libs.tslibs.timestamps import maybe_integer_op_deprecated
import pandas.compat as compat
from pandas.compat.numpy import function as nv
from pandas.errors import (
AbstractMethodError, NullFrequencyError, PerformanceWarning)
from pandas.util._decorators import deprecate_kwarg
Expand All @@ -27,13 +28,55 @@

from pandas.core.algorithms import checked_add_with_arr, take, unique1d
import pandas.core.common as com
from pandas.core.nanops import nanstd

from pandas.tseries import frequencies
from pandas.tseries.offsets import DateOffset, Tick

from .base import ExtensionOpsMixin


def _get_reduction_vals(obj, skipna):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You seem to be reinventing the wheel here. We already do all of this in nanops.py for timedelta. I am not sure how this should be integrated here, but this is not the way — it rewrites code we already have.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is definitely a Proof Of Concept. For now the main question for you is if you're on board with the idea of bringing the Index reduction signatures in line with everything else

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes about the signatures generally. But maybe let's start with min/max as more straightforward? To do what you are suggesting here will require using nanops (the interface can certainly be here), but the implementation is already there (in nanops).

if not len(obj):
return NaT

if obj.hasnans:
if not skipna:
return NaT
vals = obj.asi8[~obj._isnan]
else:
vals = obj.asi8
return vals


def _make_reduction(op, diff=False, only_timedelta=False):
"""
Make a unary reduction method that handles NaT appropriately.
"""

def method(self, skipna=True, **kwargs):
if only_timedelta:
raise TypeError('"{meth}" reduction is not valid for {cls}'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this. only_timedelta makes me think we shouldn't raise here if the array is timedelta dtype, but right now it looks like we raise unconditionally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops — that should be `if only_timedelta and not is_timedelta64_dtype(self)`.

.format(meth=op.__name__, cls=type(self).__name__))

vals = _get_reduction_vals(self, skipna)
if vals is NaT:
return NaT

# Try to minimize floating point error by rounding before casting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does DatetimeIndex do this casting?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only reductions DatetimeIndex has is min and max, for which it is not relevant.

# to int64
result = op(vals, **kwargs)
result = np.float64(result).round()
result = np.int64(result)
if diff:
return self._box_func(result) - self._box_func(0)
return self._box_func(result)

method.__name__ = op.__name__
# TODO: __doc__
return method


def _make_comparison_op(cls, op):
# TODO: share code with indexes.base version? Main difference is that
# the block for MultiIndex was removed here.
Expand Down Expand Up @@ -364,6 +407,19 @@ def _validate_frequency(cls, index, freq, **kwargs):
'does not conform to passed frequency {passed}'
.format(infer=inferred, passed=freq.freqstr))

# ----------------------------------------------------------------
# Reductions

min = _make_reduction(np.min)
max = _make_reduction(np.max)

mean = _make_reduction(np.mean)
median = _make_reduction(np.median)
std = _make_reduction(nanstd, diff=True)

sum = _make_reduction(np.sum, only_timedelta=True)
# cumsum = _make_reduction(np.cumsum, only_timedelta=True)

# ------------------------------------------------------------------
# Arithmetic Methods

Expand Down
12 changes: 8 additions & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,7 @@ def _ndarray_values(self):
def empty(self):
return not self.size

def max(self):
def max(self, skipna=True, axis=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Series, these are all (self, axis=None, skipna=None, ...).

"""
Return the maximum value of the Index.

Expand Down Expand Up @@ -826,19 +826,21 @@ def max(self):
>>> idx.max()
('b', 2)
"""
nv.validate_minmax_axis(axis)
return nanops.nanmax(self.values)

def argmax(self, axis=None):
def argmax(self, skipna=True, axis=None):
    """
    Return a ndarray of the maximum argument indexer.

    Parameters
    ----------
    skipna : bool, default True
        Accepted for signature compatibility with other reductions;
        currently not used by this implementation.
    axis : None
        Must be None; validated (and rejected if set) by
        ``nv.validate_minmax_axis``.

    See Also
    --------
    numpy.ndarray.argmax
    """
    # Raise if a non-None axis is passed (numpy-compat validation only).
    nv.validate_minmax_axis(axis)
    return nanops.nanargmax(self.values)

def min(self):
def min(self, skipna=True, axis=None):
"""
Return the minimum value of the Index.

Expand Down Expand Up @@ -869,16 +871,18 @@ def min(self):
>>> idx.min()
('a', 1)
"""
nv.validate_minmax_axis(axis)
return nanops.nanmin(self.values)

def argmin(self, axis=None):
def argmin(self, skipna=True, axis=None):
    """
    Return a ndarray of the minimum argument indexer.

    Parameters
    ----------
    skipna : bool, default True
        Accepted for signature compatibility with other reductions;
        currently not used by this implementation.
    axis : None
        Must be None; validated (and rejected if set) by
        ``nv.validate_minmax_axis``.

    See Also
    --------
    numpy.ndarray.argmin
    """
    # Raise if a non-None axis is passed (numpy-compat validation only).
    nv.validate_minmax_axis(axis)
    return nanops.nanargmin(self.values)

def tolist(self):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def tolist(self):
"""
return list(self.astype(object))

def min(self, axis=None, *args, **kwargs):
def min(self, skipna=True, axis=None, *args, **kwargs):
"""
Return the minimum value of the Index or minimum along
an axis.
Expand Down Expand Up @@ -470,7 +470,7 @@ def min(self, axis=None, *args, **kwargs):
except ValueError:
return self._na_value

def argmin(self, axis=None, *args, **kwargs):
def argmin(self, skipna=True, axis=None, *args, **kwargs):
"""
Returns the indices of the minimum values along an axis.

Expand All @@ -493,7 +493,7 @@ def argmin(self, axis=None, *args, **kwargs):
i8[mask] = np.iinfo('int64').max
return i8.argmin()

def max(self, axis=None, *args, **kwargs):
def max(self, skipna=True, axis=None, *args, **kwargs):
"""
Return the maximum value of the Index or maximum along
an axis.
Expand Down Expand Up @@ -521,7 +521,7 @@ def max(self, axis=None, *args, **kwargs):
except ValueError:
return self._na_value

def argmax(self, axis=None, *args, **kwargs):
def argmax(self, skipna=True, axis=None, *args, **kwargs):
"""
Returns the indices of the maximum values along an axis.

Expand Down
5 changes: 4 additions & 1 deletion pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from pandas.core.dtypes.common import (
is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype,
is_integer, is_integer_dtype, pandas_dtype)
is_integer, is_integer_dtype, is_scalar, pandas_dtype)

from pandas import compat
from pandas.core import common as com
Expand Down Expand Up @@ -72,6 +72,9 @@ def _delegate_property_set(self, name, value, *args, **kwargs):

def _delegate_method(self, name, *args, **kwargs):
result = operator.methodcaller(name, *args, **kwargs)(self._data)
if is_scalar(result):
# e.g. min, max, mean, ...
return result
return Index(result, name=self.name)


Expand Down
5 changes: 2 additions & 3 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@


class RangeIndex(Int64Index):

"""
Immutable Index implementing a monotonic integer range.

Expand Down Expand Up @@ -288,11 +287,11 @@ def _minmax(self, meth):

return self._start + self._step * no_steps

def min(self):
def min(self, skipna=True, axis=None):
    """Return the minimum value of the RangeIndex.

    ``skipna`` and ``axis`` exist for signature compatibility with
    other index reductions; they do not affect the result here.
    """
    return self._minmax('min')

def max(self):
def max(self, skipna=True, axis=None):
    """Return the maximum value of the RangeIndex.

    ``skipna`` and ``axis`` exist for signature compatibility with
    other index reductions; they do not affect the result here.
    """
    return self._minmax('max')

Expand Down
31 changes: 26 additions & 5 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.common import (
_get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype,
is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float,
is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype,
is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
is_float, is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype,
is_object_dtype, is_scalar, is_timedelta64_dtype)
from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna

Expand Down Expand Up @@ -426,7 +426,6 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
return _wrap_results(the_sum, dtype)


@disallow('M8')
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True, mask=None):
"""
Expand Down Expand Up @@ -462,6 +461,14 @@ def nanmean(values, axis=None, skipna=True, mask=None):
elif is_float_dtype(dtype):
dtype_sum = dtype
dtype_count = dtype
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is_datetime64_any_dtype I think.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yah, but I'm kind of hoping to get rid of that

from pandas import DatetimeIndex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this need to be boxed in an index? Shouldn't values be a DatetimeArray right now? Or is it not yet since we haven't implemented DTA as an extension array?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know this module especially well, but my assumption is that it could be a numpy array at this point.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that Ideally nanmean will only be called via Series.mean and our ExtensionArray's .mean methods. Though I may be missing some cases.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose the exception will be Series[datetime64[ns]], which may pass an ndarray here...

masked_vals = values
if mask is not None:
masked_vals = values[~mask]
the_mean = DatetimeIndex(masked_vals).mean(skipna=skipna)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please try to follow the patterns in this module; `_wrap_results` exists for a reason.

Again, I see a lot of reinventing the wheel.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing in this PR has been updated since previous conversation. I’ll get around to this suggestion before too long.

return the_mean

count = _get_counts(mask, axis, dtype=dtype_count)
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

Expand All @@ -476,7 +483,6 @@ def nanmean(values, axis=None, skipna=True, mask=None):
return _wrap_results(the_mean, dtype)


@disallow('M8')
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True, mask=None):
"""
Expand Down Expand Up @@ -508,6 +514,14 @@ def get_median(x):
return np.nanmedian(x[mask])

values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask)

if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
from pandas import DatetimeIndex
masked_vals = values
if mask is not None:
masked_vals = values[~mask]
return DatetimeIndex(masked_vals).median(skipna=skipna)

if not is_float_dtype(values):
values = values.astype('f8')
values[mask] = np.nan
Expand Down Expand Up @@ -561,7 +575,6 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
return count, d


@disallow('M8')
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
"""
Expand Down Expand Up @@ -591,6 +604,14 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
>>> nanops.nanstd(s)
1.0
"""
if is_datetime64_dtype(values) or is_datetime64tz_dtype(values):
from pandas import DatetimeIndex
masked_vals = values
if mask is not None:
masked_vals = values[~mask]
return DatetimeIndex(masked_vals).std(skipna=skipna)
# TODO: adjust by ddof?

result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof,
mask=mask))
return _wrap_results(result, values.dtype)
Expand Down
13 changes: 12 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
is_integer, is_integer_dtype, is_iterator, is_list_like, is_object_dtype,
is_scalar, is_string_like, is_timedelta64_dtype, pandas_dtype)
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries)
ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCSeries, ABCSparseArray,
ABCSparseSeries)
from pandas.core.dtypes.missing import (
isna, na_value_for_dtype, notna, remove_na_arraylike)

Expand Down Expand Up @@ -3454,6 +3455,16 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
# dispatch to ExtensionArray interface
if isinstance(delegate, ExtensionArray):
return delegate._reduce(name, skipna=skipna, **kwds)
if (isinstance(delegate, ABCDatetimeIndex) and
name in ['mean', 'median', 'std', 'min', 'max']):
if numeric_only or filter_type:
raise TypeError
method = getattr(delegate, name)
try:
return method(skipna=skipna, **kwds)
except TypeError:
# kludge because not all reduction implementations take skipna
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
return method(**kwds)

# dispatch to numpy arrays
elif isinstance(delegate, np.ndarray):
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/indexes/datetimes/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ def test_ops_properties_basic(self):
assert s.day == 10
pytest.raises(AttributeError, lambda: s.weekday)

def test_mean(self, tz_naive_fixture):
tz = tz_naive_fixture
idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02',
'2011-01-03'], tz=tz)
assert idx1.mean() == pd.Timestamp('2011-01-02', tz=tz)

idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', pd.NaT,
'2011-01-03'], tz=tz)
assert idx2.mean(skipna=False) is pd.NaT
assert idx2.mean(skipna=True) == pd.Timestamp('2011-01-02', tz=tz)

idx3 = pd.DatetimeIndex([])
assert idx3.mean() is pd.NaT

def test_minmax_tz(self, tz_naive_fixture):
tz = tz_naive_fixture
# monotonic
Expand Down
Loading