From 3c3c15655d805a731105bf969f9070d246875870 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 24 Nov 2018 16:56:45 -0800 Subject: [PATCH 1/3] implement reductions for datetimelike dtypes --- pandas/core/arrays/datetimelike.py | 56 ++++++++++++++ pandas/core/indexes/period.py | 4 + pandas/core/nanops.py | 29 +++++++- pandas/core/series.py | 13 +++- pandas/tests/indexes/datetimes/test_ops.py | 14 ++++ pandas/tests/series/test_analytics.py | 86 +++++++++++++++++++++- 6 files changed, 193 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4e784d9c89c5f..505d48df674e7 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -12,6 +12,7 @@ from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds from pandas._libs.tslibs.timestamps import maybe_integer_op_deprecated import pandas.compat as compat +from pandas.compat.numpy import function as nv from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) from pandas.util._decorators import deprecate_kwarg @@ -27,6 +28,7 @@ from pandas.core.algorithms import checked_add_with_arr, take, unique1d import pandas.core.common as com +from pandas.core.nanops import nanstd from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick @@ -34,6 +36,47 @@ from .base import ExtensionOpsMixin +def _get_reduction_vals(obj, skipna): + if not len(obj): + return NaT + + if obj.hasnans: + if not skipna: + return NaT + vals = obj.asi8[~obj._isnan] + else: + vals = obj.asi8 + return vals + + +def _make_reduction(op, diff=False, only_timedelta=False): + """ + Make a unary reduction method that handles NaT appropriately. + """ + + def method(self, skipna=True, **kwargs): + if only_timedelta: + raise TypeError('"{meth}" reduction is not valid for {cls}' + .format(meth=op.__name__, cls=type(self).__name__)) + + vals = _get_reduction_vals(self, skipna) + if vals is NaT: + return NaT + + # Try to minimize floating point error by rounding before casting + # to int64 + result = op(vals, **kwargs) + result = np.float64(result).round() + result = np.int64(result) + if diff: + return self._box_func(result) - self._box_func(0) + return self._box_func(result) + + method.__name__ = op.__name__ + # TODO: __doc__ + return method + + def _make_comparison_op(cls, op): # TODO: share code with indexes.base version? Main difference is that # the block for MultiIndex was removed here. @@ -364,6 +407,19 @@ def _validate_frequency(cls, index, freq, **kwargs): 'does not conform to passed frequency {passed}' .format(infer=inferred, passed=freq.freqstr)) + # ---------------------------------------------------------------- + # Reductions + + min = _make_reduction(np.min) + max = _make_reduction(np.max) + + mean = _make_reduction(np.mean) + median = _make_reduction(np.median) + std = _make_reduction(nanstd, diff=True) + + sum = _make_reduction(np.sum, only_timedelta=True) + # cumsum = _make_reduction(np.cumsum, only_timedelta=True) + # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f83687bacd72d..d13cf36727edc 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -12,6 +12,7 @@ is_integer_dtype, is_datetime64_any_dtype, is_bool_dtype, + is_scalar, pandas_dtype ) from pandas.core.ops import get_op_result_name @@ -81,6 +82,9 @@ def _delegate_property_set(self, name, value, *args, **kwargs): def _delegate_method(self, name, *args, **kwargs): result = operator.methodcaller(name, *args, **kwargs)(self._data) + if is_scalar(result): + # e.g. min, max, mean, ... + return result return Index(result, name=self.name) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index afba433f0e391..cc05d777d3f11 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -18,7 +18,7 @@ is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, is_timedelta64_dtype, is_datetime_or_timedelta_dtype, is_int_or_datetime_dtype, is_any_int_dtype) from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype @@ -427,7 +427,6 @@ def nansum(values, axis=None, skipna=True, min_count=0, mask=None): return _wrap_results(the_sum, dtype) -@disallow('M8') @bottleneck_switch() def nanmean(values, axis=None, skipna=True, mask=None): """ @@ -463,6 +462,14 @@ def nanmean(values, axis=None, skipna=True, mask=None): elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + masked_vals = values + if mask is not None: + masked_vals = values[~mask] + the_mean = DatetimeIndex(masked_vals).mean(skipna=skipna) + return the_mean + count = _get_counts(mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) @@ -477,7 +484,6 @@ def nanmean(values, axis=None, skipna=True, mask=None): return _wrap_results(the_mean, dtype) -@disallow('M8') @bottleneck_switch() def nanmedian(values, axis=None, skipna=True, mask=None): """ @@ -509,6 +515,14 @@ def get_median(x): return np.nanmedian(x[mask]) values, mask, dtype, dtype_max, _ = _get_values(values, skipna, mask=mask) + + if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + masked_vals = values + if mask is not None: + masked_vals = values[~mask] + return DatetimeIndex(masked_vals).median(skipna=skipna) + if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -562,7 +576,6 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): return count, d -@disallow('M8') @bottleneck_switch(ddof=1) def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): """ @@ -592,6 +605,14 @@ def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanstd(s) 1.0 """ + if is_datetime64_dtype(values) or is_datetime64tz_dtype(values): + from pandas import DatetimeIndex + masked_vals = values + if mask is not None: + masked_vals = values[~mask] + return DatetimeIndex(masked_vals).std(skipna=skipna) + # TODO: adjust by ddof? + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)) return _wrap_results(result, values.dtype) diff --git a/pandas/core/series.py b/pandas/core/series.py index 892b24f6ee552..1c4022a56b2a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -29,7 +29,8 @@ is_integer, is_integer_dtype, is_iterator, is_list_like, is_object_dtype, is_scalar, is_string_like, is_timedelta64_dtype, pandas_dtype) from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCIndexClass, ABCSeries, ABCSparseArray, ABCSparseSeries) + ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCSeries, ABCSparseArray, + ABCSparseSeries) from pandas.core.dtypes.missing import ( isna, na_value_for_dtype, notna, remove_na_arraylike) @@ -3383,6 +3384,16 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, # dispatch to ExtensionArray interface if isinstance(delegate, ExtensionArray): return delegate._reduce(name, skipna=skipna, **kwds) + if (isinstance(delegate, ABCDatetimeIndex) and + name in ['mean', 'median', 'std', 'min', 'max']): + if numeric_only or filter_type: + raise TypeError + method = getattr(delegate, name) + try: + return method(skipna=skipna, **kwds) + except TypeError: + # kludge because not all reduction implementations take skipna + return method(**kwds) # dispatch to numpy arrays elif isinstance(delegate, np.ndarray): diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index d599af6180bfb..16a4fe92031f6 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -47,6 +47,20 @@ def test_ops_properties_basic(self): assert s.day == 10 pytest.raises(AttributeError, lambda: s.weekday) + def test_mean(self, tz_naive_fixture): + tz = tz_naive_fixture + idx1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], tz=tz) + assert idx1.mean() == pd.Timestamp('2011-01-02', tz=tz) + + idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', pd.NaT, + '2011-01-03'], tz=tz) + assert idx2.mean(skipna=False) is pd.NaT + assert idx2.mean(skipna=True) == pd.Timestamp('2011-01-02', tz=tz) + + idx3 = pd.DatetimeIndex([]) + assert idx3.mean() is pd.NaT + def test_minmax_tz(self, tz_naive_fixture): tz = tz_naive_fixture # monotonic diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index a5a7cc2217864..4143426206c63 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1,5 +1,6 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 +from __future__ import division from distutils.version import LooseVersion from itertools import product @@ -15,7 +16,7 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, Series, bdate_range, compat, - date_range, isna, notna) + date_range, isna, NaT, notna) from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp from pandas.core.indexes.timedeltas import Timedelta @@ -24,6 +25,7 @@ from pandas.util.testing import ( assert_almost_equal, assert_frame_equal, assert_index_equal, assert_series_equal) +from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray class TestSeriesAnalytics(object): @@ -507,6 +509,81 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + @pytest.mark.parametrize('tz', [None, 'US/Mountain']) + def test_reductions_datetime64(self, tz): + dti = date_range('2001-01-01', periods=11, tz=tz) + # shuffle so that we are not just working with monotone-increasing + dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6]) + dtarr = DatetimeArray(dti) + ds = Series(dti) + + for obj in [dti, ds, dtarr]: + assert obj.mean() == Timestamp('2001-01-06', tz=tz) + assert obj.median() == Timestamp('2001-01-06', tz=tz) + assert obj.min() == Timestamp('2001-01-01', tz=tz) + assert obj.max() == Timestamp('2001-01-11', tz=tz) + + assert dti.argmin() == 8 + assert dti.argmax() == 3 + # TODO: implement these + # assert dtarr.argmin() == 8 + # assert dtarr.argmax() == 3 + assert ds.idxmin() == 8 + assert ds.idxmax() == 3 + + diff = dti - dti.mean() + days_std = np.std(diff.days) + adj_std = days_std * np.sqrt(11. / 10) + nanos = round(adj_std * 24 * 3600 * 1e9) + expected = Timedelta(nanos) + assert dti.std() == expected + assert ds.std() == expected + assert dtarr.std() == expected + + dti = dti.insert(3, NaT) + dtarr = DatetimeArray(dti) + ds = Series(dti) + + assert NaT in dti + assert NaT in dtarr + # assert NaT in ds # FIXME: fails + + for obj in [dti, ds, dtarr]: + assert obj.mean(skipna=True) == Timestamp('2001-01-06', tz=tz) + assert obj.median(skipna=True) == Timestamp('2001-01-06', tz=tz) + if obj is not dti: + assert obj.min(skipna=True) == Timestamp('2001-01-01', tz=tz) + assert obj.max(skipna=True) == Timestamp('2001-01-11', tz=tz) + else: + # FIXME: signature mismatch in DatetimeIndexOpsMixin + assert obj.min() == Timestamp('2001-01-01', tz=tz) + assert obj.max() == Timestamp('2001-01-11', tz=tz) + + if obj is not ds: + assert obj.mean(skipna=False) is NaT + assert obj.median(skipna=False) is NaT + else: + # FIXME: broken for Series + pass + + if obj is not dti and obj is not ds: + assert obj.min(skipna=False) is NaT + assert obj.max(skipna=False) is NaT + else: + # FIXME: signature mismatch in DatetimeIndexOpsMixin, + # no way to not-skip NaTs + # assert obj.min() is NaT + # assert obj.max() is NaT + pass + + assert dti.std(skipna=True) == expected + assert ds.std(skipna=True) == expected + assert dtarr.std(skipna=True) == expected + + assert dti.std(skipna=False) is NaT + assert ds.std(skipna=False) is NaT + assert dtarr.std(skipna=False) is NaT + def _check_stat_op(self, name, alternate, string_series_, check_objects=False, check_allna=False): @@ -516,10 +593,11 @@ def _check_stat_op(self, name, alternate, string_series_, # add some NaNs string_series_[5:15] = np.NaN - # idxmax, idxmin, min, and max are valid for dates - if name not in ['max', 'min']: + # mean, idxmax, idxmin, min, and max are valid for dates + if name not in ['max', 'min', 'mean', 'median', 'std']: ds = Series(date_range('1/1/2001', periods=10)) - pytest.raises(TypeError, f, ds) + with pytest.raises(TypeError): + f(ds) # skipna or no assert notna(f(string_series_)) From 167989c9346dc6dfc3a387c8a4a6df76c75d4a01 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 24 Nov 2018 17:03:31 -0800 Subject: [PATCH 2/3] comment for series case --- pandas/tests/series/test_analytics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d93595b274d9c..16463b15ed01c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -546,7 +546,7 @@ def test_reductions_datetime64(self, tz): assert NaT in dti assert NaT in dtarr - # assert NaT in ds # FIXME: fails + # it is not the case that `NaT in ds` for obj in [dti, ds, dtarr]: assert obj.mean(skipna=True) == Timestamp('2001-01-06', tz=tz) From febaf6731d01a3a6349d3637c0cc989eeb3c6bd7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 27 Nov 2018 17:15:18 -0800 Subject: [PATCH 3/3] standardize signatures on Index reductions --- pandas/core/base.py | 12 ++++++++---- pandas/core/indexes/datetimelike.py | 8 ++++---- pandas/core/indexes/range.py | 5 ++--- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index fd303182959a5..dc6201ded4d1e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -795,7 +795,7 @@ def _ndarray_values(self): def empty(self): return not self.size - def max(self): + def max(self, skipna=True, axis=None): """ Return the maximum value of the Index. @@ -826,9 +826,10 @@ def max(self): >>> idx.max() ('b', 2) """ + nv.validate_minmax_axis(axis) return nanops.nanmax(self.values) - def argmax(self, axis=None): + def argmax(self, skipna=True, axis=None): """ Return a ndarray of the maximum argument indexer. @@ -836,9 +837,10 @@ def argmax(self, axis=None): -------- numpy.ndarray.argmax """ + nv.validate_minmax_axis(axis) return nanops.nanargmax(self.values) - def min(self): + def min(self, skipna=True, axis=None): """ Return the minimum value of the Index. @@ -869,9 +871,10 @@ def min(self): >>> idx.min() ('a', 1) """ + nv.validate_minmax_axis(axis) return nanops.nanmin(self.values) - def argmin(self, axis=None): + def argmin(self, skipna=True, axis=None): """ Return a ndarray of the minimum argument indexer. @@ -879,6 +882,7 @@ def argmin(self, axis=None): -------- numpy.ndarray.argmin """ + nv.validate_minmax_axis(axis) return nanops.nanargmin(self.values) def tolist(self): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 0e2f7ceb24e94..a19d5e5d80624 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -442,7 +442,7 @@ def tolist(self): """ return list(self.astype(object)) - def min(self, axis=None, *args, **kwargs): + def min(self, skipna=True, axis=None, *args, **kwargs): """ Return the minimum value of the Index or minimum along an axis. @@ -470,7 +470,7 @@ def min(self, axis=None, *args, **kwargs): except ValueError: return self._na_value - def argmin(self, axis=None, *args, **kwargs): + def argmin(self, skipna=True, axis=None, *args, **kwargs): """ Returns the indices of the minimum values along an axis. @@ -493,7 +493,7 @@ def argmin(self, axis=None, *args, **kwargs): i8[mask] = np.iinfo('int64').max return i8.argmin() - def max(self, axis=None, *args, **kwargs): + def max(self, skipna=True, axis=None, *args, **kwargs): """ Return the maximum value of the Index or maximum along an axis. @@ -521,7 +521,7 @@ def max(self, axis=None, *args, **kwargs): except ValueError: return self._na_value - def argmax(self, axis=None, *args, **kwargs): + def argmax(self, skipna=True, axis=None, *args, **kwargs): """ Returns the indices of the maximum values along an axis. diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d6286244fcb7e..f197bc7ee1375 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -25,7 +25,6 @@ class RangeIndex(Int64Index): - """ Immutable Index implementing a monotonic integer range. @@ -288,11 +287,11 @@ def _minmax(self, meth): return self._start + self._step * no_steps - def min(self): + def min(self, skipna=True, axis=None): """The minimum value of the RangeIndex""" return self._minmax('min') - def max(self): + def max(self, skipna=True, axis=None): """The maximum value of the RangeIndex""" return self._minmax('max')