From 1dc49f51afe67fdc17fa2670545c053775765ebc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 2 Feb 2016 09:14:39 -0600 Subject: [PATCH] Refactored Resample API breaking change closes #11732 closes #12072 closes #9052 closes #12140 Author: Jeff Reback Closes #11841 from jreback/resample and squashes the following commits: b2056ca [Jeff Reback] DOC: clean up aggregations docs, removing from whatsnew b4dfbc5 [Jeff Reback] fix according to comments e243f18 [Jeff Reback] API: add doc examples for #9052 750556b [Jeff Reback] raise SpecificationError if we have an invalid aggregator c54ea69 [Jeff Reback] PEP updates 68428d6 [Jeff Reback] API: disallow renamed nested-dicts 83238ed [Jeff Reback] BUG: timedelta resample idempotency, #12072 e570570 [Jeff Reback] ENH: .resample API to groupby-like class, #11732 --- doc/source/api.rst | 59 ++ doc/source/cookbook.rst | 2 +- doc/source/release.rst | 7 +- doc/source/timedeltas.rst | 2 +- doc/source/timeseries.rst | 104 ++- doc/source/whatsnew/v0.10.0.txt | 64 +- doc/source/whatsnew/v0.18.0.txt | 168 ++++ doc/source/whatsnew/v0.9.1.txt | 17 +- pandas/core/base.py | 139 ++- pandas/core/generic.py | 79 +- pandas/core/groupby.py | 1043 +++++++++++++---------- pandas/core/ops.py | 36 +- pandas/core/window.py | 2 +- pandas/io/tests/test_excel.py | 2 +- pandas/tests/test_generic.py | 6 +- pandas/tests/test_groupby.py | 127 ++- pandas/tests/test_multilevel.py | 4 +- pandas/tests/test_window.py | 56 +- pandas/tseries/plotting.py | 4 +- pandas/tseries/resample.py | 885 +++++++++++++++---- pandas/tseries/tests/test_resample.py | 946 +++++++++++++++----- pandas/tseries/tests/test_timeseries.py | 10 +- pandas/util/testing.py | 19 +- 23 files changed, 2784 insertions(+), 997 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 3a6b31ceeeece..6ab7a20d6b41f 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1729,6 +1729,65 @@ The following methods are available only for ``DataFrameGroupBy`` objects. DataFrameGroupBy.corrwith DataFrameGroupBy.boxplot +Resampling +---------- +.. currentmodule:: pandas.tseries.resample + +Resampler objects are returned by resample calls: :func:`pandas.DataFrame.resample`, :func:`pandas.Series.resample`. + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Resampler.__iter__ + Resampler.groups + Resampler.indices + Resampler.get_group + +Function application +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Resampler.apply + Resampler.aggregate + Resampler.transform + +Upsampling +~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + Resampler.ffill + Resampler.backfill + Resampler.bfill + Resampler.pad + Resampler.fillna + Resampler.asfreq + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Resampler.count + Resampler.nunique + Resampler.first + Resampler.last + Resampler.max + Resampler.mean + Resampler.median + Resampler.min + Resampler.ohlc + Resampler.prod + Resampler.size + Resampler.sem + Resampler.std + Resampler.sum + Resampler.var + Style ----- .. 
currentmodule:: pandas.core.style diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index b69749839fc0e..1d301c1ee2f19 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -567,7 +567,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to return pd.NaT mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust} - ts.resample("5min",how = mhc) + ts.resample("5min").apply(mhc) ts `Create a value counts column and reassign back to the DataFrame diff --git a/doc/source/release.rst b/doc/source/release.rst index e6ab9ba574a1e..2a5168ee260a8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -48,7 +48,12 @@ users upgrade to this version. Highlights include: -See the :ref:`v0.17.0 Whatsnew ` overview for an extensive list +Highlights include: + +- Window functions are now methods on ``.groupby`` like objects, see :ref:`here `. +- API breaking ``.resample`` changes to make it more ``.groupby`` like, see :ref:`here `. + +See the :ref:`v0.18.0 Whatsnew ` overview for an extensive list of all enhancements and bugs that have been fixed in 0.17.1. Thanks diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index c9aa10478714a..29a75f3423cfa 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -401,4 +401,4 @@ Similar to :ref:`timeseries resampling `, we can resample .. ipython:: python - s.resample('D') + s.resample('D').mean() diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 80a4774e02e69..a986c3e1cb065 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -68,7 +68,7 @@ Resample: .. ipython:: python # Daily means - ts.resample('D', how='mean') + ts.resample('D').mean() .. _timeseries.overview: @@ -1211,6 +1211,11 @@ Converting to Python datetimes Resampling ---------- +.. warning:: + + The interface to ``.resample`` has changed in 0.18.0 to be more groupby-like and hence more flexible. + See the :ref:`whatsnew docs ` for a comparison with prior versions. + Pandas has a simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not @@ -1226,7 +1231,7 @@ See some :ref:`cookbook examples ` for some advanced strategi ts = Series(randint(0, 500, len(rng)), index=rng) - ts.resample('5Min', how='sum') + ts.resample('5Min').sum() The ``resample`` function is very flexible and allows you to specify many different parameters to control the frequency conversion and resampling @@ -1237,11 +1242,11 @@ an array and produces aggregated values: .. ipython:: python - ts.resample('5Min') # default is mean + ts.resample('5Min').mean() - ts.resample('5Min', how='ohlc') + ts.resample('5Min').ohlc() - ts.resample('5Min', how=np.max) + ts.resample('5Min').max() Any function available via :ref:`dispatching ` can be given to the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``sem``, @@ -1252,9 +1257,9 @@ end of the interval is closed: .. ipython:: python - ts.resample('5Min', closed='right') + ts.resample('5Min', closed='right').mean() - ts.resample('5Min', closed='left') + ts.resample('5Min', closed='left').mean() Parameters like ``label`` and ``loffset`` are used to manipulate the resulting labels. ``label`` specifies whether the result is labeled with the beginning or @@ -1263,11 +1268,11 @@ labels. .. 
ipython:: python - ts.resample('5Min') # by default label='right' + ts.resample('5Min').mean() # by default label='right' - ts.resample('5Min', label='left') + ts.resample('5Min', label='left').mean() - ts.resample('5Min', label='left', loffset='1s') + ts.resample('5Min', label='left', loffset='1s').mean() The ``axis`` parameter can be set to 0 or 1 and allows you to resample the specified axis for a DataFrame. @@ -1284,18 +1289,17 @@ frequency periods. Up Sampling ~~~~~~~~~~~ -For upsampling, the ``fill_method`` and ``limit`` parameters can be specified -to interpolate over the gaps that are created: +For upsampling, you can specify a way to upsample and the ``limit`` parameter to interpolate over the gaps that are created: .. ipython:: python # from secondly to every 250 milliseconds - ts[:2].resample('250L') + ts[:2].resample('250L').asfreq() - ts[:2].resample('250L', fill_method='pad') + ts[:2].resample('250L').ffill() - ts[:2].resample('250L', fill_method='pad', limit=2) + ts[:2].resample('250L').ffill(limit=2) Sparse Resampling ~~~~~~~~~~~~~~~~~ @@ -1317,7 +1321,7 @@ If we want to resample to the full range of the series .. ipython:: python - ts.resample('3T',how='sum') + ts.resample('3T').sum() We can instead only resample those groups where we have points as follows: @@ -1333,6 +1337,74 @@ We can instead only resample those groups where we have points as follows: ts.groupby(partial(round, freq='3T')).sum() +Aggregation +~~~~~~~~~~~ + +Similar to :ref:`groupby aggregates ` and the :ref:`window functions `, a ``Resampler`` can be selectively +resampled. + +Resampling a ``DataFrame``, the default will be to act on all columns with the same function. + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2012', freq='S', periods=1000), + columns=['A', 'B', 'C']) + r = df.resample('3T') + r.mean() + +We can select a specific column or columns using standard getitem. + +.. ipython:: python + + r['A'].mean() + + r[['A','B']].mean() + +You can pass a list or dict of functions to do aggregation with, outputting a DataFrame: + +.. ipython:: python + + r['A'].agg([np.sum, np.mean, np.std]) + +If a dict is passed, the keys will be used to name the columns. Otherwise the +function's name (stored in the function object) will be used. + +.. ipython:: python + + r['A'].agg({'result1' : np.sum, + 'result2' : np.mean}) + +On a resampled DataFrame, you can pass a list of functions to apply to each +column, which produces an aggregated result with a hierarchical index: + +.. ipython:: python + + r.agg([np.sum, np.mean]) + +By passing a dict to ``aggregate`` you can apply a different aggregation to the +columns of a DataFrame: + +.. ipython:: python + :okexcept: + + r.agg({'A' : np.sum, + 'B' : lambda x: np.std(x, ddof=1)}) + +The function names can also be strings. In order for a string to be valid it +must be implemented on the Resampled object + +.. ipython:: python + + r.agg({'A' : 'sum', 'B' : 'std'}) + +Furthermore, you can also specify multiple aggregation functions for each column separately. + +.. ipython:: python + + r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + + .. _timeseries.periods: Time Span Representation diff --git a/doc/source/whatsnew/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.txt index f4e7825032ce0..48ce09f32b12b 100644 --- a/doc/source/whatsnew/v0.10.0.txt +++ b/doc/source/whatsnew/v0.10.0.txt @@ -70,16 +70,59 @@ nfrequencies are unaffected. 
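A quick side-by-side of the defaults discussed below (a sketch added for illustration, not part of the original patch; it uses the post-refactor accessor syntax rather than ``how=``):

.. code-block:: python

    import pandas as pd

    rng = pd.date_range('1/1/2000', periods=12, freq='4h')
    ts = pd.Series(range(12), index=rng)

    # post-0.10.0 defaults for most frequencies: closed='left', label='left',
    # so the bin [2000-01-01, 2000-01-02) is labeled 2000-01-01
    ts.resample('D', closed='left', label='left').sum()

    # the prior defaults labeled that same bin with the interval end
    ts.resample('D', closed='right', label='right').sum()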
The prior defaults were causing a great deal of confusion for users, especially resampling data to daily frequency (which labeled the aggregated group with the end of the interval: the next day). -Note: - -.. ipython:: python - - dates = pd.date_range('1/1/2000', '1/5/2000', freq='4h') - series = Series(np.arange(len(dates)), index=dates) - series - series.resample('D', how='sum') - # old behavior - series.resample('D', how='sum', closed='right', label='right') +.. code-block:: python + + In [1]: dates = pd.date_range('1/1/2000', '1/5/2000', freq='4h') + + In [2]: series = Series(np.arange(len(dates)), index=dates) + + In [3]: series + Out[3]: + 2000-01-01 00:00:00 0 + 2000-01-01 04:00:00 1 + 2000-01-01 08:00:00 2 + 2000-01-01 12:00:00 3 + 2000-01-01 16:00:00 4 + 2000-01-01 20:00:00 5 + 2000-01-02 00:00:00 6 + 2000-01-02 04:00:00 7 + 2000-01-02 08:00:00 8 + 2000-01-02 12:00:00 9 + 2000-01-02 16:00:00 10 + 2000-01-02 20:00:00 11 + 2000-01-03 00:00:00 12 + 2000-01-03 04:00:00 13 + 2000-01-03 08:00:00 14 + 2000-01-03 12:00:00 15 + 2000-01-03 16:00:00 16 + 2000-01-03 20:00:00 17 + 2000-01-04 00:00:00 18 + 2000-01-04 04:00:00 19 + 2000-01-04 08:00:00 20 + 2000-01-04 12:00:00 21 + 2000-01-04 16:00:00 22 + 2000-01-04 20:00:00 23 + 2000-01-05 00:00:00 24 + Freq: 4H, dtype: int64 + + In [4]: series.resample('D', how='sum') + Out[4]: + 2000-01-01 15 + 2000-01-02 51 + 2000-01-03 87 + 2000-01-04 123 + 2000-01-05 24 + Freq: D, dtype: int64 + + In [5]: # old behavior + In [6]: series.resample('D', how='sum', closed='right', label='right') + Out[6]: + 2000-01-01 0 + 2000-01-02 21 + 2000-01-03 57 + 2000-01-04 93 + 2000-01-05 129 + Freq: D, dtype: int64 - Infinity and negative infinity are no longer treated as NA by ``isnull`` and ``notnull``. That they ever were was a relic of early pandas. This behavior @@ -354,4 +397,3 @@ Adding experimental support for Panel4D and factory functions to create n-dimens See the :ref:`full release notes ` or issue tracker on GitHub for a complete list. - diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 9dca8615af3ae..92fd7a7e9d02a 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -20,6 +20,7 @@ Highlights include: - Window functions are now methods on ``.groupby`` like objects, see :ref:`here `. - ``pd.test()`` top-level nose test runner is available (:issue:`4327`) - Adding support for a ``RangeIndex`` as a specialized form of the ``Int64Index`` for memory savings, see :ref:`here `. +- API breaking ``.resample`` changes to make it more ``.groupby`` like, see :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -352,6 +353,170 @@ other anchored offsets like ``MonthBegin`` and ``YearBegin``. d = pd.Timestamp('2014-02-15') d + pd.offsets.QuarterBegin(n=0, startingMonth=2) +.. _whatsnew_0180.breaking.resample: + +Resample API +^^^^^^^^^^^^ + +Like the change in the window functions API :ref:`above `, ``.resample(...)`` is changing to have +a more groupby-like API. (:issue:`11732`, :issue:`12702`). + +.. ipython:: python + + np.random.seed(1234) + df = pd.DataFrame(np.random.rand(10,4), + columns=list('ABCD'), + index=pd.date_range('2010-01-01 09:00:00', periods=10, freq='s')) + df + + +**Previous API**: + +You would write a resampling operation that immediately evaluates. + +This defaults to ``how='mean'`` + +.. 
code-block:: python + + In [6]: df.resample('2s') + Out[6]: + A B C D + 2010-01-01 09:00:00 0.485748 0.447351 0.357096 0.793615 + 2010-01-01 09:00:02 0.820801 0.794317 0.364034 0.531096 + 2010-01-01 09:00:04 0.433985 0.314582 0.424104 0.625733 + 2010-01-01 09:00:06 0.624988 0.609738 0.633165 0.612452 + 2010-01-01 09:00:08 0.510470 0.534317 0.573201 0.806949 + +You could also specify a ``how`` directly + +.. code-block:: python + + In [7]: df.resample('2s',how='sum') + Out[7]: + A B C D + 2010-01-01 09:00:00 0.971495 0.894701 0.714192 1.587231 + 2010-01-01 09:00:02 1.641602 1.588635 0.728068 1.062191 + 2010-01-01 09:00:04 0.867969 0.629165 0.848208 1.251465 + 2010-01-01 09:00:06 1.249976 1.219477 1.266330 1.224904 + 2010-01-01 09:00:08 1.020940 1.068634 1.146402 1.613897 + +.. warning:: + + This new API for resample includes some internal changes so that the previous (pre-0.18.0) API keeps working, with a deprecation warning, in most cases. Since the returned resampler is now a deferred object and not immediately evaluated (as before), we can intercept operations and simply do what the pre-0.18.0 API did (with a warning). Here is a typical use case: + + .. code-block:: python + + In [4]: r = df.resample('2s') + + In [6]: r*10 + pandas/tseries/resample.py:80: FutureWarning: .resample() is now a deferred operation + use .resample(...).mean() instead of .resample(...) + + Out[6]: + A B C D + 2010-01-01 09:00:00 4.857476 4.473507 3.570960 7.936154 + 2010-01-01 09:00:02 8.208011 7.943173 3.640340 5.310957 + 2010-01-01 09:00:04 4.339846 3.145823 4.241039 6.257326 + 2010-01-01 09:00:06 6.249881 6.097384 6.331650 6.124518 + 2010-01-01 09:00:08 5.104699 5.343172 5.732009 8.069486 + + However, getting and assignment operations directly on a ``Resampler`` will raise a ``ValueError``: + + .. code-block:: python + + In [7]: r.iloc[0] = 5 + ValueError: .resample() is now a deferred operation + use .resample(...).mean() instead of .resample(...) + assignment will have no effect as you are working on a copy + +**New API**: + +Now, you write ``.resample`` as a 2-stage operation like groupby, which +yields a ``Resampler``. + +.. ipython:: python + + r = df.resample('2s') + r + +Downsampling +'''''''''''' + +You can then use this object to perform similar operations. +These are downsampling operations (going from a higher frequency to a lower one). + +.. ipython:: python + + r.mean() + +.. ipython:: python + + r.sum() + +Furthermore, resample now supports ``getitem`` operations to perform the resample on specific columns. + +.. ipython:: python + + r[['A','C']].mean() + +and ``.aggregate`` type operations. + +.. ipython:: python + + r.agg({'A' : 'mean', 'B' : 'sum'}) + +These accessors can, of course, be combined + +.. ipython:: python + + r[['A','B']].agg(['mean','sum']) + +Upsampling +'''''''''' + +.. currentmodule:: pandas.tseries.resample + +Upsampling operations take you from a lower frequency to a higher frequency. These are now +performed on the ``Resampler`` objects with the :meth:`~Resampler.backfill`, +:meth:`~Resampler.ffill`, :meth:`~Resampler.fillna` and :meth:`~Resampler.asfreq` methods. + +.. ipython:: python + + s = Series(np.arange(5,dtype='int64'), + index=date_range('2010-01-01', periods=5, freq='Q')) + s + +Previously + +.. 
code-block:: python + + In [6]: s.resample('M', fill_method='ffill') + Out[6]: + 2010-03-31 0 + 2010-04-30 0 + 2010-05-31 0 + 2010-06-30 1 + 2010-07-31 1 + 2010-08-31 1 + 2010-09-30 2 + 2010-10-31 2 + 2010-11-30 2 + 2010-12-31 3 + 2011-01-31 3 + 2011-02-28 3 + 2011-03-31 4 + Freq: M, dtype: int64 + +New API + +.. ipython:: python + + s.resample('M').ffill() + +.. note:: + + In the new API, you can either downsample OR upsample. The prior implementation would allow you to pass an aggregator function (like ``mean``) even though you were upsampling, providing a bit of confusion. + Changes to eval ^^^^^^^^^^^^^^^ @@ -435,6 +600,9 @@ Other API Changes - ``DataFrame.unstack`` and ``Series.unstack`` now take ``fill_value`` keyword to allow direct replacement of missing values when an unstack results in missing values in the resulting ``DataFrame``. As an added benefit, specifying ``fill_value`` will preserve the data type of the original stacked data. (:issue:`9746`) +- As part of the new API for :ref:`window functions ` and :ref:`resampling `, aggregation functions have been +clarified, raising more informative error messages on invalid aggregations. (:issue:`9052`). A full set of examples are presented in :ref:`groupby `. + .. _whatsnew_0180.deprecations: Deprecations diff --git a/doc/source/whatsnew/v0.9.1.txt b/doc/source/whatsnew/v0.9.1.txt index ce7439b8ecd92..c803e063da843 100644 --- a/doc/source/whatsnew/v0.9.1.txt +++ b/doc/source/whatsnew/v0.9.1.txt @@ -112,14 +112,21 @@ API changes - Upsampling data with a PeriodIndex will result in a higher frequency TimeSeries that spans the original time window - .. ipython:: python - - prng = period_range('2012Q1', periods=2, freq='Q') + .. code-block:: python - s = Series(np.random.randn(len(prng)), prng) + In [1]: prng = period_range('2012Q1', periods=2, freq='Q') - s.resample('M') + In [2]: s = Series(np.random.randn(len(prng)), prng) + In [4]: s.resample('M') + Out[4]: + 2012-01 -1.471992 + 2012-02 NaN + 2012-03 NaN + 2012-04 -0.493593 + 2012-05 NaN + 2012-06 NaN + Freq: M, dtype: float64 - Period.end_time now returns the last nanosecond in the time interval (:issue:`2124`, :issue:`2125`, :issue:`1764`) diff --git a/pandas/core/base.py b/pandas/core/base.py index ec09482bb27c8..168310b6d7da0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -302,6 +302,10 @@ def _selected_obj(self): else: return self.obj[self._selection] + @cache_readonly + def ndim(self): + return self._selected_obj.ndim + @cache_readonly def _obj_with_exclusions(self): if self._selection is not None and isinstance(self.obj, @@ -412,43 +416,140 @@ def _aggregate(self, arg, *args, **kwargs): None if not required """ + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + is_nested_renamer = False + _level = kwargs.pop('_level', None) if isinstance(arg, compat.string_types): return getattr(self, arg)(*args, **kwargs), None - result = compat.OrderedDict() if isinstance(arg, dict): + + # aggregate based on the passed dict if self.axis != 0: # pragma: no cover raise ValueError('Can only pass dict with axis=0') obj = self._selected_obj - if any(isinstance(x, (list, tuple, dict)) for x in arg.values()): + # if we have a dict of any non-scalars + # eg. 
{'A' : ['mean']}, normalize all to + # be list-likes + if any(is_aggregator(x) for x in compat.itervalues(arg)): new_arg = compat.OrderedDict() for k, v in compat.iteritems(arg): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v - arg = new_arg - keys = [] - if self._selection is not None: - subset = obj + # the keys must be in the columns + # for ndim=2, or renamers for ndim=1 + + # ok + # {'A': { 'ra': 'mean' }} + # {'A': { 'ra': ['mean'] }} + # {'ra': ['mean']} + + # not ok + # {'ra' : { 'A' : 'mean' }} + if isinstance(v, dict): + is_nested_renamer = True + + if k not in obj.columns: + raise SpecificationError('cannot perform renaming ' + 'for {0} with a nested ' + 'dictionary'.format(k)) + arg = new_arg + + from pandas.tools.merge import concat + + def _agg_1dim(name, how, subset=None): + """ + aggregate a 1-dim with how + """ + colg = self._gotitem(name, ndim=1, subset=subset) + if colg.ndim != 1: + raise SpecificationError("nested dictionary is ambiguous " + "in aggregation") + return colg.aggregate(how, _level=(_level or 0) + 1) + + def _agg_2dim(name, how): + """ + aggregate a 2-dim with how + """ + colg = self._gotitem(self._selection, ndim=2, + subset=obj) + return colg.aggregate(how, _level=None) + + def _agg(arg, func): + """ + run the aggregations over the arg with func + return an OrderedDict + """ + result = compat.OrderedDict() for fname, agg_how in compat.iteritems(arg): - colg = self._gotitem(self._selection, ndim=1, - subset=subset) - result[fname] = colg.aggregate(agg_how, _level=None) - keys.append(fname) + result[fname] = func(fname, agg_how) + return result + + # set the final keys + keys = list(compat.iterkeys(arg)) + result = compat.OrderedDict() + + # nested renamer + if is_nested_renamer: + result = list(_agg(arg, _agg_1dim).values()) + + if all(isinstance(r, dict) for r in result): + + result, results = compat.OrderedDict(), result + for r in results: + result.update(r) + keys = list(compat.iterkeys(result)) + + else: + + if self._selection is not None: + keys = None + + # some selection on the object + elif self._selection is not None: + + sl = set(self._selection_list) + + # we are a Series like object, + # but may have multiple aggregations + if len(sl) == 1: + + result = _agg(arg, lambda fname, + agg_how: _agg_1dim(self._selection, agg_how)) + + # we are selecting the same set as we are aggregating + elif not len(sl - set(compat.iterkeys(arg))): + + result = _agg(arg, _agg_1dim) + + # we are a DataFrame, with possibly multiple aggregations + else: + + result = _agg(arg, _agg_2dim) + + # no selection else: - for col, agg_how in compat.iteritems(arg): - colg = self._gotitem(col, ndim=1) - result[col] = colg.aggregate(agg_how, _level=None) - keys.append(col) - if isinstance(list(result.values())[0], com.ABCDataFrame): - from pandas.tools.merge import concat + try: + result = _agg(arg, _agg_1dim) + except SpecificationError: + + # we are aggregating expecting all 1d-returns + # but we have 2d + result = _agg(arg, _agg_2dim) + + # combine results + if isinstance(result, list): + result = concat(result, keys=keys, axis=1) + elif isinstance(list(compat.itervalues(result))[0], + com.ABCDataFrame): result = concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame @@ -508,11 +609,7 @@ def _aggregate_multiple_funcs(self, arg, _level): except SpecificationError: raise - if _level: - keys = None - result = concat(results, keys=keys, axis=1) - - return result + return concat(results, keys=keys, axis=1) def 
_is_cython_func(self, arg): """ if we define an internal function for this argument, return it """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bd19f2c2302d5..ce156232ed698 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3557,22 +3557,14 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, ---------- rule : string the offset string or object representing target conversion - how : string - method for down- or re-sampling, default to 'mean' for - downsampling axis : int, optional, default 0 - fill_method : string, default None - fill_method for upsampling closed : {'right', 'left'} Which side of bin interval is closed label : {'right', 'left'} Which bin edge label to label bucket with convention : {'start', 'end', 's', 'e'} - kind : "period"/"timestamp" loffset : timedelta Adjust the resampled time labels - limit : int, default None - Maximum size gap to when reindexing with fill_method base : int, default 0 For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could @@ -3601,7 +3593,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin. - >>> series.resample('3T', how='sum') + >>> series.resample('3T').sum() 2000-01-01 00:00:00 3 2000-01-01 00:03:00 12 2000-01-01 00:06:00 21 @@ -3617,7 +3609,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, To include this value close the right side of the bin interval as illustrated in the example below this one. - >>> series.resample('3T', how='sum', label='right') + >>> series.resample('3T', label='right').sum() 2000-01-01 00:03:00 3 2000-01-01 00:06:00 12 2000-01-01 00:09:00 21 @@ -3626,7 +3618,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - >>> series.resample('3T', how='sum', label='right', closed='right') + >>> series.resample('3T', label='right', closed='right').sum() 2000-01-01 00:00:00 0 2000-01-01 00:03:00 6 2000-01-01 00:06:00 15 @@ -3635,7 +3627,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Upsample the series into 30 second bins. - >>> series.resample('30S')[0:5] #select first 5 rows + >>> series.resample('30S').asfreq()[0:5] #select first 5 rows 2000-01-01 00:00:00 0 2000-01-01 00:00:30 NaN 2000-01-01 00:01:00 1 @@ -3646,7 +3638,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Upsample the series into 30 second bins and fill the ``NaN`` values using the ``pad`` method. - >>> series.resample('30S', fill_method='pad')[0:5] + >>> series.resample('30S').pad()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 0 2000-01-01 00:01:00 1 @@ -3657,7 +3649,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, Upsample the series into 30 second bins and fill the ``NaN`` values using the ``bfill`` method. - >>> series.resample('30S', fill_method='bfill')[0:5] + >>> series.resample('30S').bfill()[0:5] 2000-01-01 00:00:00 0 2000-01-01 00:00:30 1 2000-01-01 00:01:00 1 @@ -3665,26 +3657,69 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, 2000-01-01 00:02:00 2 Freq: 30S, dtype: int64 - Pass a custom function to ``how``. + Pass a custom function via ``apply`` >>> def custom_resampler(array_like): ... 
return np.sum(array_like)+5 - >>> series.resample('3T', how=custom_resampler) + >>> series.resample('3T').apply(custom_resampler) 2000-01-01 00:00:00 8 2000-01-01 00:03:00 17 2000-01-01 00:06:00 26 Freq: 3T, dtype: int64 """ + from pandas.tseries.resample import resample - from pandas.tseries.resample import TimeGrouper axis = self._get_axis_number(axis) - sampler = TimeGrouper(rule, label=label, closed=closed, how=how, - axis=axis, kind=kind, loffset=loffset, - fill_method=fill_method, convention=convention, - limit=limit, base=base) - return sampler.resample(self).__finalize__(self) + r = resample(self, freq=rule, label=label, closed=closed, + axis=axis, kind=kind, loffset=loffset, + fill_method=fill_method, convention=convention, + limit=limit, base=base) + + # deprecation warnings + # but call methods anyhow + + if how is not None: + + # .resample(..., how='sum') + if isinstance(how, compat.string_types): + method = "{0}()".format(how) + + # .resample(..., how=lambda x: ....) + else: + method = ".apply()" + + # if we have both a how and fill_method, then show + # the following warning + if fill_method is None: + warnings.warn("how in .resample() is deprecated\n" + "the new syntax is " + ".resample(...).{method}".format( + method=method), + FutureWarning, stacklevel=2) + r = r.aggregate(how) + + if fill_method is not None: + + # show the prior function call + method = '.' + method if how is not None else '' + + args = "limit={0}".format(limit) if limit is not None else "" + warnings.warn("fill_method is deprecated to .resample()\n" + "the new syntax is .resample(...){method}" + ".{fill_method}({args})".format( + method=method, + fill_method=fill_method, + args=args), + FutureWarning, stacklevel=2) + + if how is not None: + r = getattr(r, fill_method)(limit=limit) + else: + r = r.aggregate(fill_method, limit=limit) + + return r def first(self, offset): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 27d1c60e0547a..fbc25e7fdb98d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -7,22 +7,23 @@ import copy from pandas.compat import( - zip, builtins, range, long, lzip, - OrderedDict, callable, filter, map + zip, range, long, lzip, + callable, map ) from pandas import compat -from pandas.core.base import PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError +from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, + DataError, SpecificationError) from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.index import Index, MultiIndex, CategoricalIndex, _ensure_index +from pandas.core.index import (Index, MultiIndex, CategoricalIndex, + _ensure_index) from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel -from pandas.util.decorators import (cache_readonly, Substitution, Appender, make_signature, - deprecate_kwarg) -from textwrap import dedent +from pandas.util.decorators import (cache_readonly, Substitution, Appender, + make_signature, deprecate_kwarg) import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.common import(_possibly_downcast_to_dtype, isnull, @@ -79,12 +80,13 @@ _cython_transforms = frozenset(['cumprod', 'cumsum', 'shift']) + def _groupby_function(name, alias, npfunc, numeric_only=True, _convert=False): _local_template = "Compute %(f)s of group values" - @Substitution(name='groupby',f=name) + 
@Substitution(name='groupby', f=name) @Appender(_doc_template) @Appender(_local_template) def f(self): @@ -134,13 +136,16 @@ def _last(x): class Grouper(object): """ - A Grouper allows the user to specify a groupby instruction for a target object + A Grouper allows the user to specify a groupby instruction for a target + object - This specification will select a column via the key parameter, or if the level and/or - axis parameters are given, a level of the index of the target object. + This specification will select a column via the key parameter, or if the + level and/or axis parameters are given, a level of the index of the target + object. - These are local specifications and will override 'global' settings, that is the parameters - axis and level which are passed to the groupby itself. + These are local specifications and will override 'global' settings, + that is the parameters axis and level which are passed to the groupby + itself. Parameters ---------- @@ -149,8 +154,9 @@ class Grouper(object): level : name/number, defaults to None the level for the target index freq : string / frequency object, defaults to None - This will groupby the specified frequency if the target selection (via key or level) is - a datetime-like object. For full specification of available frequencies, please see + This will groupby the specified frequency if the target selection + (via key or level) is a datetime-like object. For full specification + of available frequencies, please see `here `_. axis : number/name of the axis, defaults to 0 sort : boolean, default to False @@ -191,23 +197,22 @@ def __new__(cls, *args, **kwargs): return super(Grouper, cls).__new__(cls) def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): - self.key=key - self.level=level - self.freq=freq - self.axis=axis - self.sort=sort + self.key = key + self.level = level + self.freq = freq + self.axis = axis + self.sort = sort - self.grouper=None - self.obj=None - self.indexer=None - self.binner=None + self.grouper = None + self.obj = None + self.indexer = None + self.binner = None @property def ax(self): return self.grouper def _get_grouper(self, obj): - """ Parameters ---------- @@ -219,13 +224,16 @@ def _get_grouper(self, obj): """ self._set_grouper(obj) - self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key], axis=self.axis, - level=self.level, sort=self.sort) + self.grouper, exclusions, self.obj = _get_grouper(self.obj, [self.key], + axis=self.axis, + level=self.level, + sort=self.sort) return self.binner, self.grouper, self.obj def _set_grouper(self, obj, sort=False): """ - given an object and the specifications, setup the internal grouper for this particular specification + given an object and the specifications, setup the internal grouper + for this particular specification Parameters ---------- @@ -234,7 +242,8 @@ def _set_grouper(self, obj, sort=False): """ if self.key is not None and self.level is not None: - raise ValueError("The Grouper cannot specify both a key and a level!") + raise ValueError( + "The Grouper cannot specify both a key and a level!") # the key must be a valid info item if self.key is not None: @@ -252,17 +261,20 @@ def _set_grouper(self, obj, sort=False): # equivalent to the axis name if isinstance(ax, MultiIndex): level = ax._get_level_number(level) - ax = Index(ax.get_level_values(level), name=ax.names[level]) + ax = Index(ax.get_level_values( + level), name=ax.names[level]) else: if level not in (0, ax.name): - raise ValueError("The level {0} is not 
valid".format(level)) + raise ValueError( + "The level {0} is not valid".format(level)) # possibly sort if (self.sort or sort) and not ax.is_monotonic: indexer = self.indexer = ax.argsort(kind='quicksort') ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis, convert=False, is_copy=False) + obj = obj.take(indexer, axis=self.axis, + convert=False, is_copy=False) self.obj = obj self.grouper = ax @@ -283,6 +295,7 @@ class GroupByPlot(PandasObject): """ Class implementing the .plot attribute for groupby objects """ + def __init__(self, groupby): self._groupby = groupby @@ -300,75 +313,9 @@ def f(self): return attr -class GroupBy(PandasObject, SelectionMixin): - - """ - Class for grouping and aggregating relational data. See aggregate, - transform, and apply functions on this object. - - It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: - - :: - - grouped = groupby(obj, ...) - - Parameters - ---------- - obj : pandas object - axis : int, default 0 - level : int, default None - Level of MultiIndex - groupings : list of Grouping objects - Most users should ignore this - exclusions : array-like, optional - List of columns to exclude - name : string - Most users should ignore this - - Notes - ----- - After grouping, see aggregate, apply, and transform functions. Here are - some other brief notes about usage. When grouping by multiple groups, the - result index will be a MultiIndex (hierarchical) by default. - - Iteration produces (key, group) tuples, i.e. chunking the data by group. So - you can write code like: - - :: - - grouped = obj.groupby(keys, axis=axis) - for key, group in grouped: - # do something with the data - - Function calls on GroupBy, if not specially implemented, "dispatch" to the - grouped data. So if you group a DataFrame and wish to invoke the std() - method on each group, you can simply do: - - :: - - df.groupby(mapper).std() - - rather than - - :: - - df.groupby(mapper).aggregate(np.std) - - You can pass arguments to these "wrapped" functions, too. 
- - See the online documentation for full exposition on these topics and much - more - - Returns - ------- - **Attributes** - groups : dict - {group name -> group labels} - len(grouped) : int - Number of groups - """ - _apply_whitelist = _common_apply_whitelist +class _GroupBy(PandasObject, SelectionMixin): _group_selection = None + _apply_whitelist = frozenset([]) def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, @@ -408,27 +355,40 @@ def __unicode__(self): # TODO: Better unicode/repr for GroupBy object return object.__repr__(self) + def _assure_grouper(self): + """ + we create the grouper on instantiation + sub-classes may have a different policy + """ + pass + @property def groups(self): """ dict {group name -> group labels} """ + self._assure_grouper() return self.grouper.groups @property def ngroups(self): + self._assure_grouper() return self.grouper.ngroups @property def indices(self): """ dict {group name -> group indices} """ + self._assure_grouper() return self.grouper.indices def _get_indices(self, names): - """ safe get multiple indices, translate keys for datelike to underlying repr """ + """ + safe get multiple indices, translate keys for + datelike to underlying repr + """ def get_converter(s): # possibly convert to the actual key types # in the indices, could be a Timestamp or a np.datetime64 - if isinstance(s, (Timestamp,datetime.datetime)): + if isinstance(s, (Timestamp, datetime.datetime)): return lambda key: Timestamp(key) elif isinstance(s, np.datetime64): return lambda key: Timestamp(key).asm8 @@ -460,7 +420,8 @@ def get_converter(s): raise ValueError(msg) converters = [get_converter(s) for s in index_sample] - names = [tuple([f(n) for f, n in zip(converters, name)]) for name in names] + names = [tuple([f(n) for f, n in zip(converters, name)]) + for name in names] else: converter = get_converter(index_sample) @@ -485,10 +446,11 @@ def _selected_obj(self): def _set_selection_from_grouper(self): """ we may need create a selection if we have non-level groupers """ grp = self.grouper - if self.as_index and getattr(grp,'groupings',None) is not None and self.obj.ndim > 1: + if self.as_index and getattr(grp, 'groupings', None) is not None and \ + self.obj.ndim > 1: ax = self.obj._info_axis groupers = [g.name for g in grp.groupings - if g.level is None and g.in_axis] + if g.level is None and g.in_axis] if len(groupers): self._group_selection = ax.difference(Index(groupers)).tolist() @@ -499,11 +461,10 @@ def _set_result_index_ordered(self, result): # related 8046 # the values/counts are repeated according to the group index - indices = self.indices - # shortcut of we have an already ordered grouper if not self.grouper.is_monotonic: - index = Index(np.concatenate(self._get_indices(self.grouper.result_index))) + index = Index(np.concatenate( + self._get_indices(self.grouper.result_index))) result.index = index result = result.sort_index() @@ -549,7 +510,8 @@ def wrapper(*args, **kwargs): # a little trickery for aggregation functions that need an axis # argument kwargs_with_axis = kwargs.copy() - if 'axis' not in kwargs_with_axis or kwargs_with_axis['axis'] is None: + if 'axis' not in kwargs_with_axis or \ + kwargs_with_axis['axis'] is None: kwargs_with_axis['axis'] = self.axis def curried_with_axis(x): @@ -576,11 +538,13 @@ def curried(x): # related to : GH3688 # try item-by-item - # this can be called recursively, so need to raise ValueError if - # we don't have this method to indicated to aggregate to + # this 
can be called recursively, so need to raise + # ValueError + # if we don't have this method to indicated to aggregate to # mark this column as an error try: - return self._aggregate_item_by_item(name, *args, **kwargs) + return self._aggregate_item_by_item(name, + *args, **kwargs) except (AttributeError): raise ValueError @@ -659,35 +623,286 @@ def apply(self, func, *args, **kwargs): side-effects, as they will take effect twice for the first group. + See also -------- - aggregate, transform - pandas.Series.%(name)s - pandas.DataFrame.%(name)s - pandas.Panel.%(name)s""" + aggregate, transform""" + + func = self._is_builtin_func(func) + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + + # ignore SettingWithCopy here in case the user mutates + with option_context('mode.chained_assignment', None): + return self._python_apply_general(f) + + def _python_apply_general(self, f): + keys, values, mutated = self.grouper.apply(f, self._selected_obj, + self.axis) + + return self._wrap_applied_output(keys, values, + not_indexed_same=mutated) + + def _iterate_slices(self): + yield self.name, self._selected_obj + + def transform(self, func, *args, **kwargs): + raise AbstractMethodError(self) + + def _cumcount_array(self, arr=None, ascending=True): + """ + arr is where cumcount gets its values from + + Note + ---- + this is currently implementing sort=False + (though the default is sort=True) for groupby in general + """ + if arr is None: + arr = np.arange(self.grouper._max_groupsize, dtype='int64') + + len_index = len(self._selected_obj.index) + cumcounts = np.zeros(len_index, dtype=arr.dtype) + if not len_index: + return cumcounts + + indices, values = [], [] + for v in self.indices.values(): + indices.append(v) + + if ascending: + values.append(arr[:len(v)]) + else: + values.append(arr[len(v) - 1::-1]) + + indices = np.concatenate(indices) + values = np.concatenate(values) + cumcounts[indices] = values + + return cumcounts + + def _index_with_as_index(self, b): + """ + Take boolean mask of index to be returned from apply, if as_index=True + + """ + # TODO perf, it feels like this should already be somewhere... 
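+ # Added descriptive note (comments only, not in the original patch):
+ # given a boolean mask ``b`` over the rows of the selected object,
+ # build the MultiIndex that ``as_index=True`` output should carry:
+ # the outer levels are the group keys of the masked rows, and the
+ # inner levels are the matching entries of the original index.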
+ from itertools import chain + original = self._selected_obj.index + gp = self.grouper + levels = chain((gp.levels[i][gp.labels[i][b]] + for i in range(len(gp.groupings))), + (original.get_level_values(i)[b] + for i in range(original.nlevels))) + new = MultiIndex.from_arrays(list(levels)) + new.names = gp.names + original.names + return new + + def _try_cast(self, result, obj): + """ + try to cast the result to our obj original type, + we may have roundtripped thru object in the mean-time + + """ + if obj.ndim > 1: + dtype = obj.values.dtype + else: + dtype = obj.dtype + + if not np.isscalar(result): + result = _possibly_downcast_to_dtype(result, dtype) + + return result + + def _cython_transform(self, how, numeric_only=True): + output = {} + for name, obj in self._iterate_slices(): + is_numeric = is_numeric_dtype(obj.dtype) + if numeric_only and not is_numeric: + continue + + try: + result, names = self.grouper.transform(obj.values, how) + except AssertionError as e: + raise GroupByError(str(e)) + output[name] = self._try_cast(result, obj) + + if len(output) == 0: + raise DataError('No numeric types to aggregate') + + return self._wrap_transformed_output(output, names) + + def _cython_agg_general(self, how, numeric_only=True): + output = {} + for name, obj in self._iterate_slices(): + is_numeric = is_numeric_dtype(obj.dtype) + if numeric_only and not is_numeric: + continue + + try: + result, names = self.grouper.aggregate(obj.values, how) + except AssertionError as e: + raise GroupByError(str(e)) + output[name] = self._try_cast(result, obj) + + if len(output) == 0: + raise DataError('No numeric types to aggregate') + + return self._wrap_aggregated_output(output, names) + + def _python_agg_general(self, func, *args, **kwargs): + func = self._is_builtin_func(func) + f = lambda x: func(x, *args, **kwargs) + + # iterate through "columns" ex exclusions to populate output dict + output = {} + for name, obj in self._iterate_slices(): + try: + result, counts = self.grouper.agg_series(obj, f) + output[name] = self._try_cast(result, obj) + except TypeError: + continue + + if len(output) == 0: + return self._python_apply_general(f) + + if self.grouper._filter_empty_groups: + + mask = counts.ravel() > 0 + for name, result in compat.iteritems(output): + + # since we are masking, make sure that we have a float object + values = result + if is_numeric_dtype(values.dtype): + values = com.ensure_float(values) + + output[name] = self._try_cast(values[mask], result) + + return self._wrap_aggregated_output(output) + + def _wrap_applied_output(self, *args, **kwargs): + raise AbstractMethodError(self) + + def _concat_objects(self, keys, values, not_indexed_same=False): + from pandas.tools.merge import concat + + if not not_indexed_same: + result = concat(values, axis=self.axis) + ax = self._selected_obj._get_axis(self.axis) + + if isinstance(result, Series): + result = result.reindex(ax) + else: + result = result.reindex_axis(ax, axis=self.axis) + + elif self.group_keys: + + if self.as_index: + + # possible MI return case + group_keys = keys + group_levels = self.grouper.levels + group_names = self.grouper.names + result = concat(values, axis=self.axis, keys=group_keys, + levels=group_levels, names=group_names) + else: + + # GH5610, returns a MI, with the first level being a + # range index + keys = list(range(len(values))) + result = concat(values, axis=self.axis, keys=keys) + else: + result = concat(values, axis=self.axis) + + return result + + def _apply_filter(self, indices, dropna): + if len(indices) == 
0: + indices = [] + else: + indices = np.sort(np.concatenate(indices)) + if dropna: + filtered = self._selected_obj.take(indices, axis=self.axis) + else: + mask = np.empty(len(self._selected_obj.index), dtype=bool) + mask.fill(False) + mask[indices.astype(int)] = True + # mask fails to broadcast when passed to where; broadcast manually. + mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T + filtered = self._selected_obj.where(mask) # Fill with NaNs. + return filtered + + +class GroupBy(_GroupBy): + + """ + Class for grouping and aggregating relational data. See aggregate, + transform, and apply functions on this object. + + It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: + + :: + + grouped = groupby(obj, ...) + + Parameters + ---------- + obj : pandas object + axis : int, default 0 + level : int, default None + Level of MultiIndex + groupings : list of Grouping objects + Most users should ignore this + exclusions : array-like, optional + List of columns to exclude + name : string + Most users should ignore this + + Notes + ----- + After grouping, see aggregate, apply, and transform functions. Here are + some other brief notes about usage. When grouping by multiple groups, the + result index will be a MultiIndex (hierarchical) by default. + + Iteration produces (key, group) tuples, i.e. chunking the data by group. So + you can write code like: + + :: + + grouped = obj.groupby(keys, axis=axis) + for key, group in grouped: + # do something with the data + + Function calls on GroupBy, if not specially implemented, "dispatch" to the + grouped data. So if you group a DataFrame and wish to invoke the std() + method on each group, you can simply do: - func = self._is_builtin_func(func) + :: - @wraps(func) - def f(g): - return func(g, *args, **kwargs) + df.groupby(mapper).std() - # ignore SettingWithCopy here in case the user mutates - with option_context('mode.chained_assignment',None): - return self._python_apply_general(f) + rather than - def _python_apply_general(self, f): - keys, values, mutated = self.grouper.apply(f, self._selected_obj, - self.axis) + :: - return self._wrap_applied_output(keys, values, - not_indexed_same=mutated) + df.groupby(mapper).aggregate(np.std) - def _iterate_slices(self): - yield self.name, self._selected_obj + You can pass arguments to these "wrapped" functions, too. 
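+ With this refactor, a groupby can itself be resampled per group via the
+ new ``resample`` method below (a sketch only; the grouping column
+ ``'key'`` is hypothetical, and the frame is assumed to carry a
+ datetime-like index):
+
+ ::
+
+ df.groupby('key').resample('D').sum()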
- def transform(self, func, *args, **kwargs): - raise AbstractMethodError(self) + See the online documentation for full exposition on these topics and much + more + + Returns + ------- + **Attributes** + groups : dict + {group name -> group labels} + len(grouped) : int + Number of groups + """ + _apply_whitelist = _common_apply_whitelist def irow(self, i): """ @@ -739,6 +954,7 @@ def median(self): except Exception: # pragma: no cover self._set_selection_from_grouper() + def f(x): if isinstance(x, np.ndarray): x = Series(x) @@ -797,7 +1013,7 @@ def sem(self, ddof=1): degrees of freedom """ - return self.std(ddof=ddof)/np.sqrt(self.count()) + return self.std(ddof=ddof) / np.sqrt(self.count()) @Substitution(name='groupby') @Appender(_doc_template) @@ -825,6 +1041,30 @@ def ohlc(self): return self._apply_to_column_groupbys( lambda x: x._cython_agg_general('ohlc')) + @Substitution(name='groupby') + @Appender(_doc_template) + def resample(self, rule, **kwargs): + """ + Provide resampling when using a TimeGrouper + Return a new grouper with our resampler appended + """ + from pandas.tseries.resample import TimeGrouper + gpr = TimeGrouper(axis=self.axis, freq=rule, **kwargs) + + # we by definition have at least 1 key as we are already a grouper + groupings = list(self.grouper.groupings) + groupings.append(gpr) + + return self.__class__(self.obj, + keys=groupings, + axis=self.axis, + level=self.level, + as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + selection=self._selection) + @Substitution(name='groupby') @Appender(_doc_template) def nth(self, n, dropna=None): @@ -833,8 +1073,9 @@ def nth(self, n, dropna=None): if n is a list of ints. If dropna, will take the nth non-null row, dropna is either - Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent - to calling dropna(how=dropna) before the groupby. + Truthy (if a Series) or 'all', 'any' (if a DataFrame); + this is equivalent to calling dropna(how=dropna) before the + groupby. 
Parameters ---------- @@ -864,7 +1105,9 @@ def nth(self, n, dropna=None): A 1 4 5 6 - >>> g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + + # NaNs denote group exhausted when using dropna + >>> g.nth(1, dropna='any') B A 1 NaN @@ -876,7 +1119,8 @@ def nth(self, n, dropna=None): elif isinstance(n, (set, list, tuple)): nth_values = list(set(n)) if dropna is not None: - raise ValueError("dropna option with a list of nth values is not supported") + raise ValueError( + "dropna option with a list of nth values is not supported") else: raise TypeError("n needs to be an int or a list/set/tuple of ints") @@ -911,8 +1155,10 @@ def nth(self, n, dropna=None): if self.obj.ndim == 1: # this is a pass-thru pass - elif all([ n in ax for n in names ]): - result.index = MultiIndex.from_arrays([self.obj[name][is_nth] for name in names]).set_names(names) + elif all([x in ax for x in names]): + indicies = [self.obj[name][is_nth] for name in names] + result.index = MultiIndex.from_arrays( + indicies).set_names(names) elif self._group_selection is not None: result.index = self.obj._get_axis(self.axis)[is_nth] @@ -920,8 +1166,8 @@ def nth(self, n, dropna=None): return result - if (isinstance(self._selected_obj, DataFrame) - and dropna not in ['any', 'all']): + if isinstance(self._selected_obj, DataFrame) and \ + dropna not in ['any', 'all']: # Note: when agg-ing picker doesn't raise this, just returns NaN raise ValueError("For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " @@ -935,27 +1181,31 @@ def nth(self, n, dropna=None): # get a new grouper for our dropped obj if self.keys is None and self.level is None: - # we don't have the grouper info available (e.g. we have selected out + # we don't have the grouper info available + # (e.g. 
we have selected out # a column that is not in the current object) axis = self.grouper.axis grouper = axis[axis.isin(dropped.index)] - keys = self.grouper.names + else: - # create a grouper with the original parameters, but on the dropped object - grouper, _, _ = _get_grouper(dropped, key=self.keys, axis=self.axis, - level=self.level, sort=self.sort) + # create a grouper with the original parameters, but on the dropped + # object + grouper, _, _ = _get_grouper(dropped, key=self.keys, + axis=self.axis, level=self.level, + sort=self.sort) sizes = dropped.groupby(grouper).size() result = dropped.groupby(grouper).nth(n) - mask = (sizes 1: - dtype = obj.values.dtype - else: - dtype = obj.dtype - - if not np.isscalar(result): - result = _possibly_downcast_to_dtype(result, dtype) - - return result - - def _cython_transform(self, how, numeric_only=True): - output = {} - for name, obj in self._iterate_slices(): - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - continue - - try: - result, names = self.grouper.transform(obj.values, how) - except AssertionError as e: - raise GroupByError(str(e)) - output[name] = self._try_cast(result, obj) - - if len(output) == 0: - raise DataError('No numeric types to aggregate') - - return self._wrap_transformed_output(output, names) - - def _cython_agg_general(self, how, numeric_only=True): - output = {} - for name, obj in self._iterate_slices(): - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - continue - - try: - result, names = self.grouper.aggregate(obj.values, how) - except AssertionError as e: - raise GroupByError(str(e)) - output[name] = self._try_cast(result, obj) - - if len(output) == 0: - raise DataError('No numeric types to aggregate') - - return self._wrap_aggregated_output(output, names) - - def _python_agg_general(self, func, *args, **kwargs): - func = self._is_builtin_func(func) - f = lambda x: func(x, *args, **kwargs) - - # iterate through "columns" ex exclusions to populate output dict - output = {} - for name, obj in self._iterate_slices(): - try: - result, counts = self.grouper.agg_series(obj, f) - output[name] = self._try_cast(result, obj) - except TypeError: - continue - - if len(output) == 0: - return self._python_apply_general(f) - - if self.grouper._filter_empty_groups: - - mask = counts.ravel() > 0 - for name, result in compat.iteritems(output): - - # since we are masking, make sure that we have a float object - values = result - if is_numeric_dtype(values.dtype): - values = com.ensure_float(values) - - output[name] = self._try_cast(values[mask], result) - - return self._wrap_aggregated_output(output) - - def _wrap_applied_output(self, *args, **kwargs): - raise AbstractMethodError(self) - - def _concat_objects(self, keys, values, not_indexed_same=False): - from pandas.tools.merge import concat - - if not not_indexed_same: - result = concat(values, axis=self.axis) - ax = self._selected_obj._get_axis(self.axis) - - if isinstance(result, Series): - result = result.reindex(ax) - else: - result = result.reindex_axis(ax, axis=self.axis) - - elif self.group_keys: - - if self.as_index: - - # possible MI return case - group_keys = keys - group_levels = self.grouper.levels - group_names = self.grouper.names - result = concat(values, axis=self.axis, keys=group_keys, - levels=group_levels, names=group_names) - else: - - # GH5610, returns a MI, with the first level being a - # range index - keys = list(range(len(values))) - result = concat(values, axis=self.axis, keys=keys) - else: - 
result = concat(values, axis=self.axis) - - return result - - def _apply_filter(self, indices, dropna): - if len(indices) == 0: - indices = [] - else: - indices = np.sort(np.concatenate(indices)) - if dropna: - filtered = self._selected_obj.take(indices, axis=self.axis) - else: - mask = np.empty(len(self._selected_obj.index), dtype=bool) - mask.fill(False) - mask[indices.astype(int)] = True - # mask fails to broadcast when passed to where; broadcast manually. - mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T - filtered = self._selected_obj.where(mask) # Fill with NaNs. - return filtered - @Appender(GroupBy.__doc__) def groupby(obj, by, **kwds): @@ -1335,13 +1403,14 @@ def _is_indexed_like(obj, axes): class BaseGrouper(object): """ - This is an internal Grouper class, which actually holds the generated groups + This is an internal Grouper class, which actually holds + the generated groups """ def __init__(self, axis, groupings, sort=True, group_keys=True): self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis, self.groupings, self.sort, self.group_keys = \ - axis, groupings, sort, group_keys + axis, groupings, sort, group_keys @property def shape(self): @@ -1397,7 +1466,7 @@ def apply(self, f, data, axis=0): # we detect a mutation of some kind # so take slow path pass - except (Exception) as e: + except Exception: # raise this error to the caller pass @@ -1421,7 +1490,8 @@ def indices(self): return self.groupings[0].indices else: label_list = [ping.labels for ping in self.groupings] - keys = [_values_from_object(ping.group_index) for ping in self.groupings] + keys = [_values_from_object(ping.group_index) + for ping in self.groupings] return _get_indices_dict(label_list, keys) @property @@ -1482,7 +1552,6 @@ def group_info(self): comp_ids = com._ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups - def _get_compressed_labels(self): all_labels = [ping.labels for ping in self.groupings] if len(all_labels) > 1: @@ -1502,7 +1571,7 @@ def recons_labels(self): comp_ids, obs_ids, _ = self.group_info labels = (ping.labels for ping in self.groupings) return decons_obs_group_ids(comp_ids, - obs_ids, self.shape, labels, xnull=True) + obs_ids, self.shape, labels, xnull=True) @cache_readonly def result_index(self): @@ -1527,7 +1596,7 @@ def get_group_levels(self): return name_list - #------------------------------------------------------------ + # ------------------------------------------------------------ # Aggregation functions _cython_functions = { @@ -1546,19 +1615,22 @@ def get_group_levels(self): 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) }, 'last': 'group_last', - }, + 'ohlc': 'group_ohlc', + }, 'transform': { - 'cumprod' : 'group_cumprod', - 'cumsum' : 'group_cumsum', - } + 'cumprod': 'group_cumprod', + 'cumsum': 'group_cumsum', + } } _cython_arity = { 'ohlc': 4, # OHLC } - _name_functions = {} + _name_functions = { + 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] + } def _get_cython_function(self, kind, how, values, is_numeric): @@ -1635,11 +1707,13 @@ def _cython_operation(self, kind, values, how, axis): values = values.astype(object) try: - func, dtype_str = self._get_cython_function(kind, how, values, is_numeric) + func, dtype_str = self._get_cython_function( + kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = _algos.ensure_float64(values) - func, dtype_str = self._get_cython_function(kind, how, values, is_numeric) + func, dtype_str = self._get_cython_function( + kind, how, values, is_numeric) 
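+ # Added note: numeric values lacking a specialized cython kernel are
+ # coerced to float64 above and the lookup retried; non-numeric values
+ # cannot be coerced, so the NotImplementedError is re-raised below.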
        else:
            raise

@@ -1654,28 +1728,31 @@ def _cython_operation(self, kind, values, how, axis):
             result = np.empty(out_shape, dtype=out_dtype)
             result.fill(np.nan)
             counts = np.zeros(self.ngroups, dtype=np.int64)
-            result = self._aggregate(result, counts, values, labels, func, is_numeric)
+            result = self._aggregate(
+                result, counts, values, labels, func, is_numeric)
         elif kind == 'transform':
             result = np.empty_like(values, dtype=out_dtype)
             result.fill(np.nan)

             # temporary storage for running-total type transforms
             accum = np.empty(out_shape, dtype=out_dtype)
-            result = self._transform(result, accum, values, labels, func, is_numeric)
+            result = self._transform(
+                result, accum, values, labels, func, is_numeric)

         if com.is_integer_dtype(result):
             if len(result[result == tslib.iNaT]) > 0:
                 result = result.astype('float64')
                 result[result == tslib.iNaT] = np.nan

-        if kind == 'aggregate' and self._filter_empty_groups and not counts.all():
+        if kind == 'aggregate' and \
+           self._filter_empty_groups and not counts.all():
             if result.ndim == 2:
                 try:
                     result = lib.row_bool_subset(
                         result, (counts > 0).view(np.uint8))
                 except ValueError:
                     result = lib.row_bool_subset_object(
-                       com._ensure_object(result),
-                       (counts > 0).view(np.uint8))
+                        com._ensure_object(result),
+                        (counts > 0).view(np.uint8))
             else:
                 result = result[counts > 0]

@@ -1699,7 +1776,8 @@ def aggregate(self, values, how, axis=0):
     def transform(self, values, how, axis=0):
         return self._cython_operation('transform', values, how, axis)

-    def _aggregate(self, result, counts, values, comp_ids, agg_func, is_numeric):
+    def _aggregate(self, result, counts, values, comp_ids, agg_func,
+                   is_numeric):
         if values.ndim > 3:
             # punting for now
             raise NotImplementedError("number of dimensions is currently "
@@ -1714,7 +1792,8 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, is_numeric):

         return result

-    def _transform(self, result, accum, values, comp_ids, transform_func, is_numeric):
+    def _transform(self, result, accum, values, comp_ids, transform_func,
+                   is_numeric):

         comp_ids, _, ngroups = self.group_info
         if values.ndim > 3:
             # punting for now
@@ -1724,7 +1803,8 @@ def _transform(self, result, accum, values, comp_ids, transform_func, is_numeric

             for i, chunk in enumerate(values.transpose(2, 0, 1)):

                 chunk = chunk.squeeze()
-                agg_func(result[:, :, i], values, comp_ids, accum)
+                transform_func(result[:, :, i], values,
+                               comp_ids, accum)
         else:
             transform_func(result, values, comp_ids, accum)

@@ -1829,6 +1909,7 @@ def generate_bins_generic(values, binner, closed):

     return bins

+
 class BinGrouper(BaseGrouper):

     def __init__(self, bins, binlabels, filter_empty=False):
@@ -1862,20 +1943,21 @@ def get_iterator(self, data, axis=0):
         for each group
         """
         if isinstance(data, NDFrame):
-            slicer = lambda start,edge: data._slice(slice(start,edge),axis=axis)
+            slicer = lambda start, edge: data._slice(
+                slice(start, edge), axis=axis)
             length = len(data.axes[axis])
         else:
-            slicer = lambda start,edge: data[slice(start,edge)]
+            slicer = lambda start, edge: data[slice(start, edge)]
             length = len(data)

         start = 0
         for edge, label in zip(self.bins, self.binlabels):
             if label is not tslib.NaT:
-                yield label, slicer(start,edge)
+                yield label, slicer(start, edge)
             start = edge

         if start < length:
-            yield self.binlabels[-1], slicer(start,None)
+            yield self.binlabels[-1], slicer(start, None)

     def apply(self, f, data, axis=0):
         result_keys = []
@@ -1947,22 +2029,17 @@ def groupings(self):
         # for compat
         return None

-    #----------------------------------------------------------------------
-    # cython
aggregation - - _cython_functions = copy.deepcopy(BaseGrouper._cython_functions) - _cython_functions['aggregate']['ohlc'] = 'group_ohlc' - _cython_functions['aggregate'].pop('median') - - _name_functions = { - 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] - } - def agg_series(self, obj, func): dummy = obj[:0] grouper = lib.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() + # ---------------------------------------------------------------------- + # cython aggregation + + _cython_functions = copy.deepcopy(BaseGrouper._cython_functions) + _cython_functions['aggregate'].pop('median') + class Grouping(object): @@ -2011,9 +2088,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # pre-computed self._should_compress = True - # we have a single grouper which may be a myriad of things, some of which are - # dependent on the passing in level - # + # we have a single grouper which may be a myriad of things, + # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): @@ -2062,24 +2138,29 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if self.sort: if not self.grouper.ordered: - # technically we cannot group on an unordered Categorical + # technically we cannot group on an unordered + # Categorical # but this a user convenience to do so; the ordering - # is preserved and if it's a reduction it doesn't make any difference + # is preserved and if it's a reduction it doesn't make + # any difference pass - # fix bug #GH8868 sort=False being ignored in categorical groupby + # fix bug #GH8868 sort=False being ignored in categorical + # groupby else: cat = self.grouper.unique() - self.grouper = self.grouper.reorder_categories(cat.categories) + self.grouper = self.grouper.reorder_categories( + cat.categories) # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._labels = self.grouper.codes c = self.grouper.categories - self._group_index = CategoricalIndex(Categorical.from_codes(np.arange(len(c)), - categories=c, - ordered=self.grouper.ordered)) + self._group_index = CategoricalIndex( + Categorical.from_codes(np.arange(len(c)), + categories=c, + ordered=self.grouper.ordered)) # a passed Grouper like elif isinstance(self.grouper, Grouper): @@ -2096,7 +2177,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = self.grouper.grouper # no level passed - elif not isinstance(self.grouper, (Series, Index, Categorical, np.ndarray)): + elif not isinstance(self.grouper, + (Series, Index, Categorical, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError("Grouper for '%s' not 1-dimensional" % t) @@ -2109,8 +2191,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = None # Try for sanity raise AssertionError(errmsg) - # if we have a date/time-like grouper, make sure that we have Timestamps like - if getattr(self.grouper,'dtype',None) is not None: + # if we have a date/time-like grouper, make sure that we have + # Timestamps like + if getattr(self.grouper, 'dtype', None) is not None: if is_datetime64_dtype(self.grouper): from pandas import to_datetime self.grouper = to_datetime(self.grouper) @@ -2158,6 +2241,7 @@ def _make_labels(self): def groups(self): return self.index.groupby(self.grouper) + def _get_grouper(obj, key=None, axis=0, level=None, sort=True): """ create and return a BaseGrouper, which 
is an internal @@ -2273,10 +2357,19 @@ def is_in_obj(gpr): in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != len(obj): - raise ValueError("Categorical dtype grouper must have len(grouper) == len(data)") - - ping = Grouping(group_axis, gpr, obj=obj, name=name, - level=level, sort=sort, in_axis=in_axis) + raise ValueError("Categorical dtype grouper must " + "have len(grouper) == len(data)") + + # create the Grouping + # allow us to passing the actual Grouping as the gpr + ping = Grouping(group_axis, + gpr, + obj=obj, + name=name, + level=level, + sort=sort, + in_axis=in_axis) \ + if not isinstance(gpr, Grouping) else gpr groupings.append(ping) @@ -2308,7 +2401,8 @@ def _convert_grouper(axis, grouper): else: return grouper -def _whitelist_method_generator(klass, whitelist) : + +def _whitelist_method_generator(klass, whitelist): """ Yields all GroupBy member defs for DataFrame/Series names in _whitelist. @@ -2329,50 +2423,52 @@ def _whitelist_method_generator(klass, whitelist) : """ method_wrapper_template = \ - """def %(name)s(%(sig)s) : + """def %(name)s(%(sig)s) : \""" %(doc)s \""" f = %(self)s.__getattr__('%(name)s') return f(%(args)s)""" property_wrapper_template = \ - """@property + """@property def %(name)s(self) : \""" %(doc)s \""" return self.__getattr__('%(name)s')""" - for name in whitelist : + for name in whitelist: # don't override anything that was explicitly defined # in the base class - if hasattr(GroupBy,name) : + if hasattr(GroupBy, name): continue # ugly, but we need the name string itself in the method. - f = getattr(klass,name) + f = getattr(klass, name) doc = f.__doc__ - doc = doc if type(doc)==str else '' - if type(f) == types.MethodType : + doc = doc if type(doc) == str else '' + if isinstance(f, types.MethodType): wrapper_template = method_wrapper_template decl, args = make_signature(f) # pass args by name to f because otherwise # GroupBy._make_wrapper won't know whether # we passed in an axis parameter. 
            args_by_name = ['{0}={0}'.format(arg) for arg in args[1:]]
-            params = {'name':name,
-                      'doc':doc,
-                      'sig':','.join(decl),
-                      'self':args[0],
-                      'args':','.join(args_by_name)}
-        else :
+            params = {'name': name,
+                      'doc': doc,
+                      'sig': ','.join(decl),
+                      'self': args[0],
+                      'args': ','.join(args_by_name)}
+        else:
             wrapper_template = property_wrapper_template
-            params = {'name':name, 'doc':doc}
+            params = {'name': name, 'doc': doc}
         yield wrapper_template % params

+
 class SeriesGroupBy(GroupBy):
     #
     # Make class defs of attributes on SeriesGroupBy whitelist

     _apply_whitelist = _series_apply_whitelist
-    for _def_str in _whitelist_method_generator(Series,_series_apply_whitelist) :
+    for _def_str in _whitelist_method_generator(Series,
+                                                _series_apply_whitelist):
         exec(_def_str)

     def aggregate(self, func_or_funcs, *args, **kwargs):
@@ -2426,12 +2522,12 @@ def aggregate(self, func_or_funcs, *args, **kwargs):
         -------
         Series or DataFrame
         """
-        _level = kwargs.pop('_level',None)
+        _level = kwargs.pop('_level', None)
         if isinstance(func_or_funcs, compat.string_types):
             return getattr(self, func_or_funcs)(*args, **kwargs)

         if hasattr(func_or_funcs, '__iter__'):
-            ret = self._aggregate_multiple_funcs(func_or_funcs)
+            ret = self._aggregate_multiple_funcs(func_or_funcs, _level)
         else:
             cyfunc = self._is_cython_func(func_or_funcs)
             if cyfunc and not args and not kwargs:
@@ -2455,7 +2551,7 @@ def aggregate(self, func_or_funcs, *args, **kwargs):

     agg = aggregate

-    def _aggregate_multiple_funcs(self, arg):
+    def _aggregate_multiple_funcs(self, arg, _level):
         if isinstance(arg, dict):
             columns = list(arg.keys())
             arg = list(arg.items())
@@ -2476,6 +2572,14 @@ def _aggregate_multiple_funcs(self, arg):
                     columns.append(com._get_callable_name(f))
             arg = lzip(columns, arg)

+        # for a ndim=1, disallow a nested dict for an aggregator as
+        # this is a mis-specification of the aggregations, via a
+        # specification error
+        # e.g. g['A'].agg({'A': ..., 'B': ...})
+        if self.name in columns and len(columns) > 1:
+            raise SpecificationError('invalid aggregation names specified '
+                                     'for selected objects')
+
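For context, a minimal sketch of the behaviour this guard introduces (``df`` and ``g`` are illustrative names, not part of the patch): a dict whose keys merely rename outputs of the selected column keeps working, while a dict that reuses the selected column's own name alongside other keys is now rejected as a mis-specified aggregation:

    import numpy as np
    import pandas as pd
    from pandas.core.base import SpecificationError

    df = pd.DataFrame({'A': list('aabb'), 'B': np.arange(4.0)})
    g = df.groupby('A')

    # still allowed: the keys name new output columns
    g['B'].agg({'mean': np.mean, 'total': np.sum})

    # raises SpecificationError: 'B' is the selected column itself,
    # so mixing it with other keys is ambiguous
    try:
        g['B'].agg({'B': np.mean, 'total': np.sum})
    except SpecificationError as err:
        print(err)  # invalid aggregation names specified for selected objects
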
        results = {}
        for name, func in arg:
            obj = self
@@ -2491,6 +2595,13 @@ def _aggregate_multiple_funcs(self, arg):
                 obj._selection = name
             results[name] = obj.aggregate(func)

+        if isinstance(list(compat.itervalues(results))[0],
+                      com.ABCDataFrame):
+
+            # let higher level handle
+            if _level:
+                return results
+            return list(compat.itervalues(results))[0]
         return DataFrame(results, columns=columns)

     def _wrap_output(self, output, index, names=None):
@@ -2583,7 +2694,8 @@ def transform(self, func, *args, **kwargs):
                 return getattr(self, func)(*args, **kwargs)
             else:
                 # cythonized aggregation and merge
-                return self._transform_fast(lambda : getattr(self, func)(*args, **kwargs))
+                return self._transform_fast(
+                    lambda: getattr(self, func)(*args, **kwargs))

         # reg transform
         dtype = self._selected_obj.dtype
@@ -2615,10 +2727,11 @@ def transform(self, func, *args, **kwargs):

     def _transform_fast(self, func):
         """
-        fast version of transform, only applicable to builtin/cythonizable functions
+        fast version of transform, only applicable to
+        builtin/cythonizable functions
         """
         if isinstance(func, compat.string_types):
-            func = getattr(self,func)
+            func = getattr(self, func)

         ids, _, ngroup = self.grouper.group_info
         mask = ids != -1
@@ -2634,7 +2747,7 @@ def _transform_fast(self, func):

         return Series(out, index=self.obj.index)

-    def filter(self, func, dropna=True, *args, **kwargs):
+    def filter(self, func, dropna=True, *args, **kwargs):  # noqa
         """
         Return a copy of a Series excluding elements from groups that
         do not satisfy the boolean criterion specified by func.
@@ -2676,6 +2789,7 @@ def true_and_notnull(x, *args, **kwargs):
         return filtered

     def nunique(self, dropna=True):
+        """ Returns number of unique elements in the group """
         ids, _, _ = self.grouper.group_info
         val = self.obj.get_values()
@@ -2711,15 +2825,16 @@ def nunique(self, dropna=True):
                       index=self.grouper.result_index,
                       name=self.name)

-    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    @deprecate_kwarg('take_last', 'keep',
+                     mapping={True: 'last', False: 'first'})
     @Appender(Series.nlargest.__doc__)
     def nlargest(self, n=5, keep='first'):
         # ToDo: When we remove deprecate_kwargs, we can remove these methods
         # and include nlargest and nsmallest in _series_apply_whitelist
         return self.apply(lambda x: x.nlargest(n=n, keep=keep))

-
-    @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'})
+    @deprecate_kwarg('take_last', 'keep',
+                     mapping={True: 'last', False: 'first'})
     @Appender(Series.nsmallest.__doc__)
     def nsmallest(self, n=5, keep='first'):
         return self.apply(lambda x: x.nsmallest(n=n, keep=keep))
@@ -2764,7 +2879,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
         # new values are where sorted labels change
         inc = np.r_[True, lab[1:] != lab[:-1]]
         inc[idx] = True  # group boundaries are also new values
-        out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
+        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts

        # num.
of times each group should be repeated rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) @@ -2839,12 +2954,15 @@ def count(self): ids = com._ensure_platform_int(ids) out = np.bincount(ids[mask], minlength=ngroups or None) - return Series(out, index=self.grouper.result_index, name=self.name, dtype='int64') + return Series(out, + index=self.grouper.result_index, + name=self.name, dtype='int64') def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) + class NDFrameGroupBy(GroupBy): def _iterate_slices(self): @@ -2865,7 +2983,8 @@ def _iterate_slices(self): yield val, slicer(val) def _cython_agg_general(self, how, numeric_only=True): - new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) + new_items, new_blocks = self._cython_agg_blocks( + how, numeric_only=numeric_only) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -2901,7 +3020,8 @@ def _cython_agg_blocks(self, how, numeric_only=True): for block in data.blocks: - result, _ = self.grouper.aggregate(block.values, how, axis=agg_axis) + result, _ = self.grouper.aggregate( + block.values, how, axis=agg_axis) # see if we can cast the block back to the original dtype result = block._try_coerce_and_cast_result(result) @@ -2929,7 +3049,7 @@ def _post_process_cython_aggregate(self, obj): def aggregate(self, arg, *args, **kwargs): - _level = kwargs.pop('_level',None) + _level = kwargs.pop('_level', None) result, how = self._aggregate(arg, _level=_level, *args, **kwargs) if how is None: return result @@ -2944,9 +3064,11 @@ def aggregate(self, arg, *args, **kwargs): # try to treat as if we are passing a list try: assert not args and not kwargs - result = self._aggregate_multiple_funcs([arg], _level=_level) - result.columns = Index(result.columns.levels[0], - name=self._selected_obj.columns.name) + result = self._aggregate_multiple_funcs( + [arg], _level=_level) + result.columns = Index( + result.columns.levels[0], + name=self._selected_obj.columns.name) except: result = self._aggregate_generic(arg, *args, **kwargs) @@ -2994,7 +3116,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): obj = self._obj_with_exclusions result = {} cannot_agg = [] - errors=None + errors = None for item in obj: try: data = obj[item] @@ -3007,7 +3129,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): continue except TypeError as e: cannot_agg.append(item) - errors=e + errors = e continue result_columns = obj.columns @@ -3086,7 +3208,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): x if x is not None else v._constructor(**v._construct_axes_dict()) for x in values - ] + ] v = values[0] @@ -3146,12 +3268,14 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # normally use vstack as its faster than concat # and if we have mi-columns - if isinstance(v.index, MultiIndex) or key_index is None: + if isinstance(v.index, + MultiIndex) or key_index is None: stacked_values = np.vstack(map(np.asarray, values)) result = DataFrame(stacked_values, index=key_index, columns=index) else: - # GH5788 instead of stacking; concat gets the dtypes correct + # GH5788 instead of stacking; concat gets the + # dtypes correct from pandas.tools.merge import concat result = concat(values, keys=key_index, names=key_index.names, @@ -3169,15 +3293,15 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have 
object dtypes here - if (self._selected_obj.ndim == 2 and - self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): + so = self._selected_obj + if (so.ndim == 2 and so.dtypes.isin(_DATELIKE_DTYPES).any()): result = result._convert(numeric=True) date_cols = self._selected_obj.select_dtypes( include=list(_DATELIKE_DTYPES)).columns date_cols = date_cols.intersection(result.columns) result[date_cols] = (result[date_cols] ._convert(datetime=True, - coerce=True)) + coerce=True)) else: result = result._convert(datetime=True) @@ -3185,10 +3309,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): else: # only coerce dates if we find at least 1 datetime - coerce = True if any([ isinstance(v,Timestamp) for v in values ]) else False + coerce = True if any([isinstance(x, Timestamp) + for x in values]) else False return (Series(values, index=key_index) ._convert(datetime=True, - coerce=coerce)) + coerce=coerce)) else: # Handle cases like BinGrouper @@ -3280,17 +3405,17 @@ def transform(self, func, *args, **kwargs): return self._transform_general(func, *args, **kwargs) results = np.empty_like(obj.values, result.values.dtype) - indices = self.indices for (name, group), (i, row) in zip(self, result.iterrows()): indexer = self._get_index(name) if len(indexer) > 0: - results[indexer] = np.tile(row.values,len(indexer)).reshape(len(indexer),-1) + results[indexer] = np.tile(row.values, len( + indexer)).reshape(len(indexer), -1) counts = self.size().fillna(0).values if any(counts == 0): results = self._try_cast(results, obj[result.columns]) - return (DataFrame(results,columns=result.columns,index=obj.index) + return (DataFrame(results, columns=result.columns, index=obj.index) ._convert(datetime=True)) def _define_paths(self, func, *args, **kwargs): @@ -3344,7 +3469,7 @@ def _transform_item_by_item(self, obj, wrapper): return DataFrame(output, index=obj.index, columns=columns) - def filter(self, func, dropna=True, *args, **kwargs): + def filter(self, func, dropna=True, *args, **kwargs): # noqa """ Return a copy of a DataFrame excluding elements from groups that do not satisfy the boolean criterion specified by func. @@ -3399,7 +3524,7 @@ class DataFrameGroupBy(NDFrameGroupBy): _apply_whitelist = _dataframe_apply_whitelist # # Make class defs of attributes on DataFrameGroupBy whitelist. - for _def_str in _whitelist_method_generator(DataFrame,_apply_whitelist) : + for _def_str in _whitelist_method_generator(DataFrame, _apply_whitelist): exec(_def_str) _block_agg_axis = 1 @@ -3517,8 +3642,9 @@ def _wrap_agged_blocks(self, items, blocks): def _reindex_output(self, result): """ if we have categorical groupers, then we want to make sure that - we have a fully reindex-output to the levels. These may have not participated in - the groupings (e.g. may have all been nan groups) + we have a fully reindex-output to the levels. These may have not + participated in the groupings (e.g. 
may have all been + nan groups) This can re-expand the output space """ @@ -3531,9 +3657,9 @@ def _reindex_output(self, result): for ping in groupings]): return result - levels_list = [ ping.group_index for ping in groupings ] + levels_list = [ping.group_index for ping in groupings] index = MultiIndex.from_product(levels_list, names=self.grouper.names) - d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False } + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} return result.reindex(**d).sortlevel(axis=self.axis) def _iterate_column_groupbys(self): @@ -3667,7 +3793,7 @@ class NDArrayGroupBy(GroupBy): pass -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Splitting / application @@ -3784,7 +3910,7 @@ def get_splitter(data, *args, **kwargs): return klass(data, *args, **kwargs) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Misc utilities @@ -3835,7 +3961,7 @@ def loop(labels, shape): stride //= shape[i] out += labels[i] * stride - if xnull: # exclude nulls + if xnull: # exclude nulls mask = labels[0] == -1 for lab in labels[1:nlev]: mask |= lab == -1 @@ -3913,7 +4039,7 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels, xnull): # obs ids are deconstructable! take the fast route! out = decons_group_index(obs_ids, shape) return out if xnull or not lift.any() \ - else [x - y for x, y in zip(out, lift)] + else [x - y for x, y in zip(out, lift)] i = unique_label_indices(comp_ids) i8copy = lambda a: a.astype('i8', subok=False, copy=True) @@ -3948,25 +4074,25 @@ def _lexsort_indexer(keys, orders=None, na_position='last'): # create the Categorical else: - c = Categorical(key,ordered=True) + c = Categorical(key, ordered=True) - if na_position not in ['last','first']: + if na_position not in ['last', 'first']: raise ValueError('invalid na_position: {!r}'.format(na_position)) n = len(c.categories) codes = c.codes.copy() mask = (c.codes == -1) - if order: # ascending + if order: # ascending if na_position == 'last': codes = np.where(mask, n, codes) elif na_position == 'first': codes += 1 - else: # not order means descending + else: # not order means descending if na_position == 'last': - codes = np.where(mask, n, n-codes-1) + codes = np.where(mask, n, n - codes - 1) elif na_position == 'first': - codes = np.where(mask, 0, n-codes) + codes = np.where(mask, 0, n - codes) if mask.any(): n += 1 @@ -3975,10 +4101,11 @@ def _lexsort_indexer(keys, orders=None, na_position='last'): return _indexer_from_factorized(labels, shape) + def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): """ - This is intended to be a drop-in replacement for np.argsort which handles NaNs - It adds ascending and na_position parameters. + This is intended to be a drop-in replacement for np.argsort which + handles NaNs. It adds ascending and na_position parameters. 
GH #6399, #5231 """ @@ -3998,7 +4125,8 @@ def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): indexer = non_nan_idx[non_nans.argsort(kind=kind)] if not ascending: indexer = indexer[::-1] - # Finally, place the NaNs at the end or the beginning according to na_position + # Finally, place the NaNs at the end or the beginning according to + # na_position if na_position == 'last': indexer = np.concatenate([indexer, nan_idx]) elif na_position == 'first': @@ -4038,8 +4166,8 @@ def _get_indices_dict(label_list, keys): group_index = get_group_index(label_list, shape, sort=True, xnull=True) ngroups = ((group_index.size and group_index.max()) + 1) \ - if _int64_overflow_possible(shape) \ - else np.prod(shape, dtype='i8') + if _int64_overflow_possible(shape) \ + else np.prod(shape, dtype='i8') sorter = _get_group_index_sorter(group_index, ngroups) @@ -4049,7 +4177,7 @@ def _get_indices_dict(label_list, keys): return lib.indices_fast(sorter, group_index, keys, sorted_labels) -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # sorting levels...cleverly? def _get_group_index_sorter(group_index, ngroups): @@ -4068,7 +4196,7 @@ def _get_group_index_sorter(group_index, ngroups): """ count = len(group_index) alpha = 0.0 # taking complexities literally; there may be - beta = 1.0 # some room for fine-tuning these parameters + beta = 1.0 # some room for fine-tuning these parameters if alpha + beta * ngroups < count * np.log(count): sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), ngroups) @@ -4119,7 +4247,8 @@ def _reorder_by_uniques(uniques, labels): def _groupby_indices(values): - return _algos.groupby_indices(_values_from_object(com._ensure_object(values))) + return _algos.groupby_indices(_values_from_object( + com._ensure_object(values))) def numpy_groupby(data, labels, axis=0): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 938e6f16e0531..01df9218c1936 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -24,7 +24,7 @@ is_integer_dtype, is_categorical_dtype, is_object_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, PerformanceWarning) + is_bool_dtype, PerformanceWarning, ABCSeries) # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory @@ -274,7 +274,7 @@ class _TimeOp(object): def __init__(self, left, right, name, na_op): # need to make sure that we are aligning the data - if isinstance(left, pd.Series) and isinstance(right, pd.Series): + if isinstance(left, ABCSeries) and isinstance(right, ABCSeries): left, right = left.align(right, copy=False) lvalues = self._convert_to_array(left, name=name) @@ -412,9 +412,9 @@ def _convert_to_array(self, values, name=None, other=None): values = pd.DatetimeIndex(values) # datetime array with tz elif com.is_datetimetz(values): - if isinstance(values, pd.Series): + if isinstance(values, ABCSeries): values = values._values - elif not (isinstance(values, (np.ndarray, pd.Series)) and + elif not (isinstance(values, (np.ndarray, ABCSeries)) and is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): @@ -579,7 +579,7 @@ def na_op(x, y): result = expressions.evaluate(op, str_rep, x, y, raise_on_error=True, **eval_kwargs) except TypeError: - if isinstance(y, (np.ndarray, pd.Series, pd.Index)): + if isinstance(y, 
(np.ndarray, ABCSeries, pd.Index)): dtype = np.find_common_type([x.dtype, y.dtype], []) result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) @@ -619,7 +619,7 @@ def wrapper(left, right, name=name, na_op=na_op): wrap_results = time_converted.wrap_results na_op = time_converted.na_op - if isinstance(rvalues, pd.Series): + if isinstance(rvalues, ABCSeries): rindex = getattr(rvalues, 'index', rvalues) name = _maybe_match_name(left, rvalues) lvalues = getattr(lvalues, 'values', lvalues) @@ -672,7 +672,7 @@ def na_op(x, y): if isinstance(y, list): y = lib.list_to_object_array(y) - if isinstance(y, (np.ndarray, pd.Series)): + if isinstance(y, (np.ndarray, ABCSeries)): if not is_object_dtype(y.dtype): result = lib.vec_compare(x, y.astype(np.object_), op) else: @@ -727,7 +727,7 @@ def wrapper(self, other, axis=None): if axis is not None: self._get_axis_number(axis) - if isinstance(other, pd.Series): + if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) if len(self) != len(other): raise ValueError('Series lengths must match to compare') @@ -785,7 +785,7 @@ def na_op(x, y): if isinstance(y, list): y = lib.list_to_object_array(y) - if isinstance(y, (np.ndarray, pd.Series)): + if isinstance(y, (np.ndarray, ABCSeries)): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? else: @@ -812,7 +812,7 @@ def wrapper(self, other): fill_int = lambda x: x.fillna(0) fill_bool = lambda x: x.fillna(False).astype(bool) - if isinstance(other, pd.Series): + if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) other = other.reindex_like(self) is_other_int_dtype = is_integer_dtype(other.dtype) @@ -923,9 +923,9 @@ def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # validate axis self._get_axis_number(axis) - if isinstance(other, pd.Series): + if isinstance(other, ABCSeries): return self._binop(other, op, level=level, fill_value=fill_value) - elif isinstance(other, (np.ndarray, pd.Series, list, tuple)): + elif isinstance(other, (np.ndarray, ABCSeries, list, tuple)): if len(other) != len(self): raise ValueError('Lengths must be equal') return self._binop(self._constructor(other, self.index), op, @@ -981,7 +981,7 @@ def na_op(x, y): raise_on_error=True, **eval_kwargs) except TypeError: xrav = x.ravel() - if isinstance(y, (np.ndarray, pd.Series)): + if isinstance(y, (np.ndarray, ABCSeries)): dtype = np.find_common_type([x.dtype, y.dtype], []) result = np.empty(x.size, dtype=dtype) yrav = y.ravel() @@ -1053,7 +1053,7 @@ def na_op(x, y): def f(self, other, axis=default_axis, level=None, fill_value=None): if isinstance(other, pd.DataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) - elif isinstance(other, pd.Series): + elif isinstance(other, ABCSeries): return self._combine_series(other, na_op, fill_value, axis, level) elif isinstance(other, (list, tuple)): if axis is not None and self._get_axis_name(axis) == 'index': @@ -1102,7 +1102,7 @@ def na_op(x, y): except TypeError: xrav = x.ravel() result = np.empty(x.size, dtype=x.dtype) - if isinstance(y, (np.ndarray, pd.Series)): + if isinstance(y, (np.ndarray, ABCSeries)): yrav = y.ravel() mask = notnull(xrav) & notnull(yrav) result[mask] = op(np.array(list(xrav[mask])), @@ -1124,7 +1124,7 @@ def f(self, other, axis=default_axis, level=None): if isinstance(other, pd.DataFrame): # Another DataFrame return self._flex_compare_frame(other, na_op, 
str_rep, level) - elif isinstance(other, pd.Series): + elif isinstance(other, ABCSeries): return self._combine_series(other, na_op, None, axis, level) elif isinstance(other, (list, tuple)): @@ -1167,7 +1167,7 @@ def _comp_method_FRAME(func, name, str_rep, masker=False): def f(self, other): if isinstance(other, pd.DataFrame): # Another DataFrame return self._compare_frame(other, func, str_rep) - elif isinstance(other, pd.Series): + elif isinstance(other, ABCSeries): return self._combine_series_infer(other, func) else: @@ -1253,7 +1253,7 @@ def f(self, other): if isinstance(other, self._constructor): return self._compare_constructor(other, na_op) elif isinstance(other, (self._constructor_sliced, pd.DataFrame, - pd.Series)): + ABCSeries)): raise Exception("input needs alignment for this object [%s]" % self._constructor) else: diff --git a/pandas/core/window.py b/pandas/core/window.py index 40906cf8e5363..9c8490f608996 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -76,7 +76,7 @@ def _convert_freq(self, how=None): "to passing to a window function", FutureWarning, stacklevel=6) - obj = obj.resample(self.freq, how=how) + obj = obj.resample(self.freq).aggregate(how or 'asfreq') return obj def _create_blocks(self, how): diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index ccfe50991fc67..389d41327d75c 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1139,7 +1139,7 @@ def test_to_excel_periodindex(self): _skip_if_no_xlrd() frame = self.tsframe - xp = frame.resample('M', kind='period') + xp = frame.resample('M', kind='period').mean() with ensure_clean(self.ext) as path: xp.to_excel(path, 'sht1') diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3754155cca0a3..2a97fdad8dd44 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -696,13 +696,13 @@ def test_metadata_propagation_indiv(self): ts = Series(np.random.rand(1000), index=date_range('20130101', periods=1000, freq='s'), name='foo') - result = ts.resample('1T') + result = ts.resample('1T').mean() self.check_metadata(ts, result) - result = ts.resample('1T', how='min') + result = ts.resample('1T').min() self.check_metadata(ts, result) - result = ts.resample('1T', how=lambda x: x.sum()) + result = ts.resample('1T').apply(lambda x: x.sum()) self.check_metadata(ts, result) _metadata = Series._metadata diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 30c2621cd64ef..0f99d367de6fd 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1462,42 +1462,97 @@ def test_aggregate_api_consistency(self): # make sure that the aggregates via dict # are consistent - def compare(result, expected): - # if we ar passin dicts then ordering is not guaranteed for output - # columns - assert_frame_equal(result.reindex_like(expected), expected) - - df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) grouped = df.groupby(['A', 'B']) - result = grouped[['D', 'C']].agg({'r': np.sum, 'r2': np.mean}) - expected = pd.concat([grouped[['D', 'C']].sum(), - grouped[['D', 'C']].mean()], - keys=['r', 'r2'], - axis=1).stack(level=1) - 
compare(result, expected) - - result = grouped[['D', 'C']].agg({'r': {'C': np.sum}, - 'r2': {'D': np.mean}}) - expected = pd.concat([grouped[['C']].sum(), - grouped[['D']].mean()], + c_mean = grouped['C'].mean() + c_sum = grouped['C'].sum() + d_mean = grouped['D'].mean() + d_sum = grouped['D'].sum() + + result = grouped['D'].agg(['sum', 'mean']) + expected = pd.concat([d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_tuples([('r', 'C'), ('r2', 'D')]) - compare(result, expected) + expected.columns = ['sum', 'mean'] + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg([np.sum, np.mean]) + expected = pd.concat([c_sum, + c_mean, + d_sum, + d_mean], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) result = grouped[['D', 'C']].agg([np.sum, np.mean]) - expected = pd.concat([grouped['D'].sum(), - grouped['D'].mean(), - grouped['C'].sum(), - grouped['C'].mean()], + expected = pd.concat([d_sum, + d_mean, + c_sum, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['D', 'C'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': 'mean', 'D': 'sum'}) + expected = pd.concat([d_sum, + c_mean], + axis=1) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': ['mean', 'sum'], + 'D': ['mean', 'sum']}) + expected = pd.concat([c_mean, + c_sum, + d_mean, + d_sum], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['mean', 'sum']]) + + result = grouped[['D', 'C']].agg({'r': np.sum, + 'r2': np.mean}) + expected = pd.concat([d_sum, + c_sum, + d_mean, + c_mean], axis=1) - expected.columns = MultiIndex.from_product([['D', 'C'], ['sum', 'mean'] - ]) - compare(result, expected) + expected.columns = MultiIndex.from_product([['r', 'r2'], + ['D', 'C']]) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_nested_dicts(self): + + # API change for disallowing these types of nested dicts + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + def f(): + g.aggregate({'r1': {'C': ['mean', 'sum']}, + 'r2': {'D': ['mean', 'sum']}}) + + self.assertRaises(SpecificationError, f) + + result = g.agg({'C': {'ra': ['mean', 'std']}, + 'D': {'rb': ['mean', 'std']}}) + expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), + g['D'].std()], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( + 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) + assert_frame_equal(result, expected, check_like=True) def test_multi_iter(self): s = Series(np.arange(6)) @@ -4333,7 +4388,7 @@ def test_groupby_with_timegrouper(self): expected.iloc[[0, 6, 18], 0] = np.array( [24., 6., 9.], dtype='float64') - result1 = df.resample('5D', how=sum) + result1 = df.resample('5D') .sum() assert_frame_equal(result1, expected) df_sorted = df.sort_index() @@ -4549,7 +4604,7 @@ def test_timegrouper_with_reg_groups(self): for freq in ['D', 'M', 'A', 'Q-APR']: expected = df.groupby('user_id')[ 'whole_cost'].resample( - freq, how='sum').dropna().reorder_levels( + freq).sum().dropna().reorder_levels( ['date', 'user_id']).sortlevel().astype('int64') expected.name = 'whole_cost' @@ -5269,9 +5324,9 @@ def test_groupby_selection_with_methods(self): assert_frame_equal(g.apply(lambda x: x.sum()), 
g_exp.apply(lambda x: x.sum())) - assert_frame_equal(g.resample('D'), g_exp.resample('D')) - assert_frame_equal(g.resample('D', how='ohlc'), - g_exp.resample('D', how='ohlc')) + assert_frame_equal(g.resample('D').mean(), g_exp.resample('D').mean()) + assert_frame_equal(g.resample('D').ohlc(), + g_exp.resample('D').ohlc()) assert_frame_equal(g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)) @@ -5471,7 +5526,7 @@ def test_tab_completion(self): 'cumprod', 'tail', 'resample', 'cummin', 'fillna', 'cumsum', 'cumcount', 'all', 'shift', 'skew', 'bfill', 'ffill', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', - 'dtypes', 'diff', 'idxmax', 'idxmin']) + 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin']) self.assertEqual(results, expected) def test_lexsort_indexer(self): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6302c011a4491..cd9f44317da49 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1211,10 +1211,10 @@ def test_stack_multiple_bug(self): multi = df.set_index(['DATE', 'ID']) multi.columns.name = 'Params' unst = multi.unstack('ID') - down = unst.resample('W-THU') + down = unst.resample('W-THU').mean() rs = down.stack('ID') - xp = unst.ix[:, ['VAR1']].resample('W-THU').stack('ID') + xp = unst.ix[:, ['VAR1']].resample('W-THU').mean().stack('ID') xp.columns.name = 'Params' assert_frame_equal(rs, xp) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index d3e8320fd282d..65f90c320bb68 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -18,6 +18,7 @@ import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow +from pandas.core.base import SpecificationError import pandas.util.testing as tm from pandas.compat import range, zip, PY3 @@ -121,10 +122,6 @@ def test_agg(self): b_std = r['B'].std() b_sum = r['B'].sum() - def compare(result, expected): - # if we are using dicts, the orderings is not guaranteed - assert_frame_equal(result.reindex_like(expected), expected) - result = r.aggregate([np.mean, np.std]) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', @@ -132,8 +129,9 @@ def compare(result, expected): assert_frame_equal(result, expected) result = r.aggregate({'A': np.mean, 'B': np.std}) + expected = pd.concat([a_mean, b_std], axis=1) - compare(result, expected) + assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': ['mean', 'std']}) expected = pd.concat([a_mean, a_std], axis=1) @@ -150,7 +148,7 @@ def compare(result, expected): expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) - compare(result, expected) + assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, @@ -159,33 +157,19 @@ def compare(result, expected): expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ( 'A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - compare(result, expected) + assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ( 'A', 'std'), ('B', 'mean'), ('B', 'std')]) - compare(result, expected) - - result = r.aggregate({'r1': 
{'A': ['mean', 'sum']},
-                               'r2': {'B': ['mean', 'sum']}})
-        expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
-        expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'), (
-            'r1', 'A', 'sum'), ('r2', 'B', 'mean'), ('r2', 'B', 'sum')])
-        compare(result, expected)
-
-        result = r.agg({'A': {'ra': ['mean', 'std']},
-                        'B': {'rb': ['mean', 'std']}})
-        expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
-        expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), (
-            'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')])
-        compare(result, expected)
+        assert_frame_equal(result, expected, check_like=True)

         # passed lambda
         result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
         rcustom = r['B'].apply(lambda x: np.std(x, ddof=1))
         expected = pd.concat([a_sum, rcustom], axis=1)
-        compare(result, expected)
+        assert_frame_equal(result, expected, check_like=True)

     def test_agg_consistency(self):
@@ -204,6 +188,32 @@ def test_agg_consistency(self):
         expected = pd.MultiIndex.from_tuples([('A', 'sum'), ('A', 'mean')])
         tm.assert_index_equal(result, expected)

+    def test_agg_nested_dicts(self):
+
+        # API change for disallowing these types of nested dicts
+        df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
+        r = df.rolling(window=3)
+
+        def f():
+            r.aggregate({'r1': {'A': ['mean', 'sum']},
+                         'r2': {'B': ['mean', 'sum']}})
+
+        self.assertRaises(SpecificationError, f)
+
+        expected = pd.concat([r['A'].mean(), r['A'].std(), r['B'].mean(),
+                              r['B'].std()], axis=1)
+        expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
+            'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
+        result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
+                                    'B': {'rb': ['mean', 'std']}})
+        assert_frame_equal(result, expected, check_like=True)
+
+        result = r.agg({'A': {'ra': ['mean', 'std']},
+                        'B': {'rb': ['mean', 'std']}})
+        expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), (
+            'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')])
+        assert_frame_equal(result, expected, check_like=True)
+
     def test_window_with_args(self):
         tm._skip_if_no_scipy()

diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py
index 729e85b0ad595..5e26d5dbf9387 100644
--- a/pandas/tseries/plotting.py
+++ b/pandas/tseries/plotting.py
@@ -71,8 +71,8 @@ def _maybe_resample(series, ax, kwargs):
             freq = ax_freq
         elif _is_sup(freq, ax_freq):  # one is weekly
             how = kwargs.pop('how', 'last')
-            series = series.resample('D', how=how).dropna()
-            series = series.resample(ax_freq, how=how).dropna()
+            series = getattr(series.resample('D'), how)().dropna()
+            series = getattr(series.resample(ax_freq), how)().dropna()
             freq = ax_freq
         elif frequencies.is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq):
             _upsample_others(ax, freq, kwargs)

diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
index 0ecdb43895f07..4e7962686db59 100644
--- a/pandas/tseries/resample.py
+++ b/pandas/tseries/resample.py
@@ -1,6 +1,13 @@
 from datetime import timedelta
 import numpy as np
-from pandas.core.groupby import BinGrouper, Grouper
+import warnings
+
+import pandas as pd
+from pandas.core.base import AbstractMethodError
+
+from pandas.core.groupby import (BinGrouper, Grouper, _GroupBy, GroupBy,
+                                 SeriesGroupBy, groupby, PanelGroupBy)
+
 from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod
 from pandas.tseries.index import DatetimeIndex, date_range
 from pandas.tseries.tdi import TimedeltaIndex
@@ -14,7 +21,682 @@
 import pandas.tslib as tslib

-_DEFAULT_METHOD = 'mean'
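Everything from here down replaces the old eager resample path with a deferred, groupby-like class hierarchy. A sketch of the user-facing effect (data values illustrative):

    import numpy as np
    import pandas as pd

    ts = pd.Series(np.arange(10.0),
                   index=pd.date_range('2013-01-01', periods=10, freq='s'))

    r = ts.resample('5s')       # a DatetimeIndexResampler; nothing computed yet
    r.mean()                    # aggregation happens at the method call
    r.ohlc()                    # open/high/low/close columns
    r.apply(lambda x: x.sum())  # apply/aggregate accept arbitrary callables

Legacy usage of the returned object (arithmetic, comparisons, most attribute access) routes through ``_deprecated``, which warns and evaluates ``.mean()`` for backwards compatibility, as the class below shows.
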
+class Resampler(_GroupBy):
+
+    """
+    Class for resampling datetimelike data, a groupby-like operation.
+    See aggregate, transform, and apply functions on this object.
+
+    It's easiest to use obj.resample(...) to get a Resampler.
+
+    Parameters
+    ----------
+    obj : pandas object
+    groupby : a TimeGrouper object
+    axis : int, default 0
+    kind : str or None
+        'period', 'timestamp' to override default index treatment
+
+    Notes
+    -----
+    After resampling, see aggregate, apply, and transform functions.
+
+    Returns
+    -------
+    a Resampler of the appropriate type
+    """
+
+    # to the groupby descriptor
+    _attributes = ['freq', 'axis', 'closed', 'label', 'convention',
+                   'loffset', 'base', 'kind']
+
+    # API compat of allowed attributes
+    _deprecated_valids = _attributes + ['_ipython_display_', '__doc__',
+                                        '_cache', '_attributes', 'binner',
+                                        'grouper', 'groupby', 'keys',
+                                        'sort', 'kind', 'squeeze',
+                                        'group_keys', 'as_index',
+                                        'exclusions']
+
+    # API compat of disallowed attributes
+    _deprecated_invalids = ['iloc', 'loc', 'ix', 'iat', 'at']
+
+    def __init__(self, obj, groupby, axis=0, kind=None, **kwargs):
+        self.groupby = groupby
+        self.keys = None
+        self.sort = True
+        self.axis = axis
+        self.kind = kind
+        self.squeeze = False
+        self.group_keys = True
+        self.as_index = True
+        self.exclusions = set()
+        self.binner = None
+        self.grouper = None
+
+        self.groupby._set_grouper(self._convert_obj(obj), sort=True)
+
+    def __unicode__(self):
+        """ provide a nice str repr of our rolling object """
+        attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k))
+                 for k in self._attributes if
+                 getattr(self.groupby, k, None) is not None]
+        return "{klass} [{attrs}]".format(klass=self.__class__.__name__,
+                                          attrs=', '.join(attrs))
+
+    @property
+    def obj(self):
+        return self.groupby.obj
+
+    @property
+    def ax(self):
+        return self.groupby.ax
+
+    @property
+    def _typ(self):
+        """ masquerade for compat as a Series or a DataFrame """
+        if isinstance(self._selected_obj, pd.Series):
+            return 'series'
+        return 'dataframe'
+
+    def _deprecated(self):
+        warnings.warn(".resample() is now a deferred operation\n"
+                      "use .resample(...).mean() instead of .resample(...)",
+                      FutureWarning, stacklevel=2)
+        return self.mean()
+
+    def _make_deprecated_binop(op):
+        # op is a string
+
+        def _evaluate_numeric_binop(self, other):
+            result = self._deprecated()
+            return getattr(result, op)(other)
+        return _evaluate_numeric_binop
+
+    def _make_deprecated_unary(op):
+        # op is a callable
+
+        def _evaluate_numeric_unary(self):
+            result = self._deprecated()
+            return op(result)
+        return _evaluate_numeric_unary
+
+    def __array__(self):
+        return self._deprecated().__array__()
+
+    __gt__ = _make_deprecated_binop('__gt__')
+    __ge__ = _make_deprecated_binop('__ge__')
+    __lt__ = _make_deprecated_binop('__lt__')
+    __le__ = _make_deprecated_binop('__le__')
+    __eq__ = _make_deprecated_binop('__eq__')
+    __ne__ = _make_deprecated_binop('__ne__')
+
+    __add__ = __radd__ = _make_deprecated_binop('__add__')
+    __sub__ = __rsub__ = _make_deprecated_binop('__sub__')
+    __mul__ = __rmul__ = _make_deprecated_binop('__mul__')
+    __floordiv__ = __rfloordiv__ = _make_deprecated_binop('__floordiv__')
+    __truediv__ = __rtruediv__ = _make_deprecated_binop('__truediv__')
+    if not compat.PY3:
+        __div__ = __rdiv__ = _make_deprecated_binop('__div__')
+    __neg__ = _make_deprecated_unary(lambda x: -x)
+    __pos__ = _make_deprecated_unary(lambda x: x)
+    __abs__ = _make_deprecated_unary(lambda x: np.abs(x))
+    __inv__ = _make_deprecated_unary(lambda x: -x)
+
+    def __getattr__(self, attr):
if attr in self._internal_names_set: + return object.__getattribute__(self, attr) + if attr in self._attributes: + return getattr(self.groupby, attr) + if attr in self.obj: + return self[attr] + + if attr in self._deprecated_invalids: + raise ValueError(".resample() is now a deferred operation\n" + "\tuse .resample(...).mean() instead of " + ".resample(...)\n" + "\tassignment will have no effect as you " + "are working on a copy") + if attr not in self._deprecated_valids: + self = self._deprecated() + return object.__getattribute__(self, attr) + + def __setattr__(self, attr, value): + if attr not in self._deprecated_valids: + raise ValueError("cannot set values on {0}".format( + self.__class__.__name__)) + object.__setattr__(self, attr, value) + + def __setitem__(self, attr, value): + raise ValueError("cannot set items on {0}".format( + self.__class__.__name__)) + + def _convert_obj(self, obj): + """ + provide any conversions for the object in order to correctly handle + + Parameters + ---------- + obj : the object to be resampled + """ + obj = obj.consolidate() + return obj + + def _get_binner_for_time(self): + raise AbstractMethodError(self) + + def _set_binner(self): + """ + setup our binners + cache these as we are an immutable object + """ + + if self.binner is None: + self.binner, self.grouper = self._get_binner() + + def _get_binner(self): + """ + create the BinGrouper, assume that self.set_grouper(obj) + has already been called + """ + + binner, bins, binlabels = self._get_binner_for_time() + bin_grouper = BinGrouper(bins, binlabels) + return binner, bin_grouper + + def _assure_grouper(self): + """ make sure that we are creating our binner & grouper """ + self._set_binner() + + def aggregate(self, arg, *args, **kwargs): + """ + Apply aggregation function or functions to resampled groups, yielding + most likely Series but in some cases DataFrame depending on the output + of the aggregation function + + Parameters + ---------- + func_or_funcs : function or list / dict of functions + List/dict of functions will produce DataFrame with column names + determined by the function names themselves (list) or the keys in + the dict + + Notes + ----- + agg is an alias for aggregate. Use it. 
+
+        Examples
+        --------
+        >>> s = Series([1,2,3,4,5],
+                       index=pd.date_range('20130101',
+                                           periods=5,freq='s'))
+        2013-01-01 00:00:00    1
+        2013-01-01 00:00:01    2
+        2013-01-01 00:00:02    3
+        2013-01-01 00:00:03    4
+        2013-01-01 00:00:04    5
+        Freq: S, dtype: int64
+
+        >>> r = s.resample('2s')
+        DatetimeIndexResampler [freq=<2 * Seconds>, axis=0, closed=left,
+                                label=left, convention=start, base=0]
+
+        >>> r.agg(np.sum)
+        2013-01-01 00:00:00    3
+        2013-01-01 00:00:02    7
+        2013-01-01 00:00:04    5
+        Freq: 2S, dtype: int64
+
+        >>> r.agg(['sum','mean','max'])
+                             sum  mean  max
+        2013-01-01 00:00:00    3   1.5    2
+        2013-01-01 00:00:02    7   3.5    4
+        2013-01-01 00:00:04    5   5.0    5
+
+        >>> r.agg({'result' : lambda x: x.mean() / x.std(),
+                   'total' : np.sum})
+                             total    result
+        2013-01-01 00:00:00      3  2.121320
+        2013-01-01 00:00:02      7  4.949747
+        2013-01-01 00:00:04      5       NaN
+
+        See also
+        --------
+        transform
+
+        Returns
+        -------
+        Series or DataFrame
+        """
+
+        self._set_binner()
+        result, how = self._aggregate(arg, *args, **kwargs)
+        if result is None:
+            return self._groupby_and_aggregate(self.grouper,
+                                               arg,
+                                               *args,
+                                               **kwargs)
+
+        return result
+
+    agg = aggregate
+    apply = aggregate
+
+    def transform(self, arg, *args, **kwargs):
+        """
+        Call function producing a like-indexed Series on each group and return
+        a Series with the transformed values
+
+        Parameters
+        ----------
+        func : function
+            To apply to each group. Should return a Series with the same index
+
+        Examples
+        --------
+        >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
+
+        Returns
+        -------
+        transformed : Series
+        """
+        return self._selected_obj.groupby(self.groupby).transform(
+            arg, *args, **kwargs)
+
+    def _downsample(self, f):
+        raise AbstractMethodError(self)
+
+    def _upsample(self, f, limit=None):
+        raise AbstractMethodError(self)
+
+    def _gotitem(self, key, ndim, subset=None):
+        """
+        sub-classes to define
+        return a sliced object
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : 1,2
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        self._set_binner()
+        grouper = self.grouper
+        if subset is None:
+            subset = self.obj
+        grouped = groupby(subset, by=None, grouper=grouper, axis=self.axis)
+
+        # try the key selection
+        try:
+            return grouped[key]
+        except KeyError:
+            return grouped
+
+    def _groupby_and_aggregate(self, grouper, how, *args, **kwargs):
+        """ re-evaluate the obj with a groupby aggregation """
+
+        if grouper is None:
+            self._set_binner()
+            grouper = self.grouper
+
+        obj = self._selected_obj
+
+        try:
+            grouped = groupby(obj, by=None, grouper=grouper, axis=self.axis)
+        except TypeError:
+
+            # panel grouper
+            grouped = PanelGroupBy(obj, grouper=grouper, axis=self.axis)
+
+        try:
+            result = grouped.aggregate(how, *args, **kwargs)
+        except Exception:
+
+            # we have a non-reducing function
+            # try to evaluate
+            result = grouped.apply(how, *args, **kwargs)
+
+        return self._wrap_result(result)
+
+    def _wrap_result(self, result):
+        """ potentially wrap any results """
+        return result
+
+    def pad(self, limit=None):
+        """
+        Forward fill the values
+
+        Parameters
+        ----------
+        limit : integer, optional
+            limit of how many values to fill
+
+        See Also
+        --------
+        Series.fillna
+        DataFrame.fillna
+        """
+        return self._upsample('pad', limit=limit)
+    ffill = pad
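Together with ``backfill``, ``fillna``, and ``asfreq`` just below, these fill methods replace the old ``fill_method`` argument for upsampling. A short sketch of the intended usage (values illustrative):

    import pandas as pd

    ts = pd.Series([1.0, 2.0],
                   index=pd.date_range('2013-01-01', periods=2, freq='2s'))

    up = ts.resample('s')
    up.asfreq()   # reindex to the new frequency, gaps become NaN
    up.ffill()    # forward fill, an alias of pad()
    up.bfill(1)   # backward fill at most one consecutive gap
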
+
+    def backfill(self, limit=None):
+        """
+        Backward fill the values
+
+        Parameters
+        ----------
+        limit : integer, optional
+            limit of how many values to fill
+
+        See Also
+        --------
+        Series.fillna
+        DataFrame.fillna
+        """
+        return self._upsample('backfill', limit=limit)
+    bfill = backfill
+
+    def fillna(self, method, limit=None):
+        """
+        Fill missing values introduced by upsampling
+
+        Parameters
+        ----------
+        method : str, method of resampling ('ffill', 'bfill')
+        limit : integer, optional
+            limit of how many values to fill
+
+        See Also
+        --------
+        Series.fillna
+        DataFrame.fillna
+        """
+        return self._upsample(method, limit=limit)
+
+    def asfreq(self):
+        """
+        return the values at the new freq,
+        essentially a reindex (with no filling)
+        """
+        return self._upsample(None)
+
+    def std(self, ddof=1):
+        """
+        Compute standard deviation of groups, excluding missing values
+
+        Parameters
+        ----------
+        ddof : integer, default 1
+            degrees of freedom
+        """
+        return self._downsample('std', ddof=ddof)
+
+    def var(self, ddof=1):
+        """
+        Compute variance of groups, excluding missing values
+
+        Parameters
+        ----------
+        ddof : integer, default 1
+            degrees of freedom
+        """
+        return self._downsample('var', ddof=ddof)
+Resampler._deprecated_valids += dir(Resampler)
+
+# downsample methods
+for method in ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem',
+               'median', 'prod', 'ohlc']:
+
+    def f(self, _method=method):
+        return self._downsample(_method)
+    f.__doc__ = getattr(GroupBy, method).__doc__
+    setattr(Resampler, method, f)
+
+# groupby & aggregate methods
+for method in ['count', 'size']:
+
+    def f(self, _method=method):
+        return self._groupby_and_aggregate(None, _method)
+    f.__doc__ = getattr(GroupBy, method).__doc__
+    setattr(Resampler, method, f)
+
+# series only methods
+for method in ['nunique']:
+    def f(self, _method=method):
+        return self._groupby_and_aggregate(None, _method)
+    f.__doc__ = getattr(SeriesGroupBy, method).__doc__
+    setattr(Resampler, method, f)
+
+
+class DatetimeIndexResampler(Resampler):
+
+    def _get_binner_for_time(self):
+
+        # this is how we are actually creating the bins
+        if self.kind == 'period':
+            return self.groupby._get_time_period_bins(self.ax)
+        return self.groupby._get_time_bins(self.ax)
+
+    def _downsample(self, how, **kwargs):
+        """
+        Downsample the cython defined function
+
+        Parameters
+        ----------
+        how : string / cython mapped function
+        **kwargs : kw args passed to how function
+        """
+        self._set_binner()
+        how = self._is_cython_func(how) or how
+        ax = self.ax
+        obj = self._selected_obj
+
+        if not len(ax):
+            # reset to the new freq
+            obj = obj.copy()
+            obj.index.freq = self.freq
+            return obj
+
+        # do we have a regular frequency
+        if ax.freq is not None or ax.inferred_freq is not None:
+
+            if len(self.grouper.binlabels) > len(ax):
+
+                # let's do an asfreq
+                return self.asfreq()
+
+        # we are downsampling
+        # we want to call the actual grouper method here
+        result = obj.groupby(
+            self.grouper, axis=self.axis).aggregate(how, **kwargs)
+
+        loffset = self.loffset
+        if isinstance(loffset, compat.string_types):
+            loffset = to_offset(self.loffset)
+
+        if isinstance(loffset, (DateOffset, timedelta)) and \
+           isinstance(result.index, DatetimeIndex) and \
+           len(result.index) > 0:
+            result.index = result.index + loffset
+
+        return self._wrap_result(result)
+
+    def _upsample(self, method, limit=None):
+        """
+        method : string {'backfill', 'bfill', 'pad', 'ffill'}
+            method for upsampling
+        limit : int, default None
+            Maximum size gap to fill when reindexing
+
+        See also
+        --------
+        .fillna
+
+        """
+        self._set_binner()
+        if self.axis:
+            raise AssertionError('axis must be 0')
+
+        ax = self.ax
+        obj = self._selected_obj
+        binner = self.binner
+
+        if self.closed == 'right':
+            res_index = binner[1:]
+        else:
+            res_index = binner[:-1]
+
+        # if we have
the same frequency as our axis, then we are equal sampling + if limit is None and to_offset(ax.inferred_freq) == self.freq: + result = obj.copy() + result.index = res_index + else: + result = obj.reindex(res_index, method=method, + limit=limit) + + return self._wrap_result(result) + + def _wrap_result(self, result): + result = super(DatetimeIndexResampler, self)._wrap_result(result) + + # we may have a different kind that we were asked originally + # convert if needed + if self.kind == 'period' and not isinstance(result.index, PeriodIndex): + result.index = result.index.to_period(self.freq) + return result + + +class PeriodIndexResampler(DatetimeIndexResampler): + + def _convert_obj(self, obj): + obj = super(PeriodIndexResampler, self)._convert_obj(obj) + + offset = to_offset(self.freq) + if offset.n > 1: + if self.kind == 'period': # pragma: no cover + print('Warning: multiple of frequency -> timestamps') + + # Cannot have multiple of periods, convert to timestamp + self.kind = 'timestamp' + + if not len(obj): + self.kind = 'timestamp' + + # convert to timestamp + if not (self.kind is None or self.kind == 'period'): + obj = obj.to_timestamp(how=self.convention) + return obj + + def aggregate(self, arg, *args, **kwargs): + result, how = self._aggregate(arg, *args, **kwargs) + if result is None: + result = self._downsample(arg, *args, **kwargs) + + return result + + agg = aggregate + + def _get_new_index(self): + """ return our new index """ + ax = self.ax + obj = self._selected_obj + + if len(ax) == 0: + new_index = PeriodIndex(data=[], freq=self.freq) + return obj.reindex(new_index) + + start = ax[0].asfreq(self.freq, how=self.convention) + end = ax[-1].asfreq(self.freq, how='end') + + return period_range(start, end, freq=self.freq) + + def _downsample(self, how, **kwargs): + """ + Downsample the cython defined function + + Parameters + ---------- + how : string / cython mapped function + **kwargs : kw args passed to how function + """ + + # we may need to actually resample as if we are timestamps + if self.kind == 'timestamp': + return super(PeriodIndexResampler, self)._downsample(how, **kwargs) + + how = self._is_cython_func(how) or how + ax = self.ax + + new_index = self._get_new_index() + if len(new_index) == 0: + return self._wrap_result(new_index) + + # Start vs. end of period + memb = ax.asfreq(self.freq, how=self.convention) + + if is_subperiod(ax.freq, self.freq): + # Downsampling + rng = np.arange(memb.values[0], memb.values[-1] + 1) + bins = memb.searchsorted(rng, side='right') + grouper = BinGrouper(bins, new_index) + return self._groupby_and_aggregate(grouper, how) + elif is_superperiod(ax.freq, self.freq): + return self.asfreq() + + raise ValueError('Frequency {axfreq} cannot be ' + 'resampled to {freq}'.format( + axfreq=ax.freq, + freq=self.freq)) + + def _upsample(self, method, limit=None): + """ + method : string {'backfill', 'bfill', 'pad', 'ffill'} + method for upsampling + limit : int, default None + Maximum size gap to fill when reindexing + + See also + -------- + .fillna + + """ + # we may need to actually resample as if we are timestamps + if self.kind == 'timestamp': + return super(PeriodIndexResampler, self)._upsample(method, + limit=limit) + + ax = self.ax + obj = self.obj + + new_index = self._get_new_index() + if len(new_index) == 0: + return self._wrap_result(new_index) + + if not is_superperiod(ax.freq, self.freq): + return self.asfreq() + + # Start vs. 
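``PeriodIndexResampler._downsample`` above chooses between real binning, a plain ``asfreq``, and a ``ValueError`` based on the sub/superperiod predicates. A quick illustration of what those predicates answer, assuming their 0.18-era home in ``pandas.tseries.frequencies``:

    from pandas.tseries.frequencies import is_subperiod, is_superperiod

    is_subperiod('D', 'M')      # True: daily bins nest inside months
    is_superperiod('A', 'Q')    # True: annual data can be spread over quarters
    is_subperiod('W-WED', 'M')  # False: weeks straddle month boundaries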
end of period + memb = ax.asfreq(self.freq, how=self.convention) + + # Get the fill indexer + indexer = memb.get_indexer(new_index, method=method, limit=limit) + return self._wrap_result(_take_new_index(obj, + indexer, + new_index, + axis=self.axis)) + + +class TimedeltaResampler(DatetimeIndexResampler): + + def _get_binner_for_time(self): + return self.groupby._get_time_delta_bins(self.ax) + + +def resample(obj, kind=None, **kwds): + """ create a TimeGrouper and return our resampler """ + tg = TimeGrouper(**kwds) + return tg._get_resampler(obj, kind=kind) +resample.__doc__ = Resampler.__doc__ class TimeGrouper(Grouper): @@ -35,6 +717,7 @@ class TimeGrouper(Grouper): Use begin, end, nperiods to generate intervals that cannot be derived directly from the associated object """ + def __init__(self, freq='Min', closed=None, label=None, how='mean', nperiods=None, axis=0, fill_method=None, limit=None, loffset=None, kind=None, @@ -74,40 +757,52 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', super(TimeGrouper, self).__init__(freq=freq, axis=axis, **kwargs) - def resample(self, obj): - self._set_grouper(obj, sort=True) - ax = self.grouper + def _get_resampler(self, obj, kind=None): + """ + return my resampler or raise if we have an invalid axis + + Parameters + ---------- + obj : input object + kind : string, optional + 'period','timestamp','timedelta' are valid + + Returns + ------- + a Resampler + Raises + ------ + TypeError if incompatible axis + + """ + self._set_grouper(obj) + + ax = self.ax if isinstance(ax, DatetimeIndex): - rs = self._resample_timestamps() - elif isinstance(ax, PeriodIndex): - offset = to_offset(self.freq) - if offset.n > 1: - if self.kind == 'period': # pragma: no cover - print('Warning: multiple of frequency -> timestamps') - # Cannot have multiple of periods, convert to timestamp - self.kind = 'timestamp' - - if self.kind is None or self.kind == 'period': - rs = self._resample_periods() - else: - obj = self.obj.to_timestamp(how=self.convention) - self._set_grouper(obj) - rs = self._resample_timestamps() + return DatetimeIndexResampler(obj, + groupby=self, + kind=kind, + axis=self.axis) + elif isinstance(ax, PeriodIndex) or kind == 'period': + return PeriodIndexResampler(obj, + groupby=self, + kind=kind, + axis=self.axis) elif isinstance(ax, TimedeltaIndex): - rs = self._resample_timestamps(kind='timedelta') - elif len(ax) == 0: - return self.obj - else: # pragma: no cover - raise TypeError('Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex') - - rs_axis = rs._get_axis(self.axis) - rs_axis.name = ax.name - return rs + return TimedeltaResampler(obj, + groupby=self, + axis=self.axis) + + raise TypeError("Only valid with DatetimeIndex, " + "TimedeltaIndex or PeriodIndex, " + "but got an instance of %r" % type(ax).__name__) def _get_grouper(self, obj): - self._set_grouper(obj) - return self._get_binner_for_resample() + # create the resampler and return our binner + r = self._get_resampler(obj) + r._set_binner() + return r.binner, r.grouper, r.obj def _get_binner_for_resample(self, kind=None): # create the BinGrouper @@ -130,15 +825,12 @@ def _get_binner_for_grouping(self, obj): # return an ordering of the transformed group labels, # suitable for multi-grouping, e.g the labels for # the resampled intervals - ax = self._set_grouper(obj) - self._get_binner_for_resample() + binner, grouper, obj = self._get_grouper(obj) - # create the grouper - binner = self.binner l = [] - for key, group in self.grouper.get_iterator(ax): - 
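``_get_resampler`` picks the ``Resampler`` subclass purely from the axis type. A hedged sketch of the dispatch seen from the user side (data made up for illustration):

    import pandas as pd

    # each supported index type gets its own Resampler subclass
    pd.Series(1, index=pd.date_range('2000-01-01', periods=4,
                                     freq='D')).resample('2D')
    pd.Series(1, index=pd.period_range('2000-01', periods=4,
                                       freq='M')).resample('Q')
    pd.Series(1, index=pd.timedelta_range('0s', periods=4,
                                          freq='s')).resample('2s')

    # anything else is rejected as soon as the resampler is constructed
    try:
        pd.Series([1, 2, 3]).resample('D')
    except TypeError as e:
        print(e)  # Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex...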
l.extend([key]*len(group)) - grouper = binner.__class__(l,freq=binner.freq,name=binner.name) + for key, group in grouper.get_iterator(self.ax): + l.extend([key] * len(group)) + grouper = binner.__class__(l, freq=binner.freq, name=binner.name) # since we may have had to sort # may need to reorder groups here @@ -153,11 +845,13 @@ def _get_time_bins(self, ax): 'an instance of %r' % type(ax).__name__) if len(ax) == 0: - binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name) + binner = labels = DatetimeIndex( + data=[], freq=self.freq, name=ax.name) return binner, [], labels first, last = ax.min(), ax.max() - first, last = _get_range_edges(first, last, self.freq, closed=self.closed, + first, last = _get_range_edges(first, last, self.freq, + closed=self.closed, base=self.base) tz = ax.tz binner = labels = DatetimeIndex(freq=self.freq, @@ -178,7 +872,8 @@ def _get_time_bins(self, ax): binner, bin_edges = self._adjust_bin_edges(binner, ax_values) # general version, knowing nothing about relative frequencies - bins = lib.generate_bins_dt64(ax_values, bin_edges, self.closed, hasnans=ax.hasnans) + bins = lib.generate_bins_dt64( + ax_values, bin_edges, self.closed, hasnans=ax.hasnans) if self.closed == 'right': labels = binner @@ -227,7 +922,8 @@ def _get_time_delta_bins(self, ax): 'an instance of %r' % type(ax).__name__) if not len(ax): - binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name) + binner = labels = TimedeltaIndex( + data=[], freq=self.freq, name=ax.name) return binner, [], labels labels = binner = TimedeltaIndex(start=ax[0], @@ -250,7 +946,8 @@ def _get_time_period_bins(self, ax): 'an instance of %r' % type(ax).__name__) if not len(ax): - binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name) + binner = labels = PeriodIndex( + data=[], freq=self.freq, name=ax.name) return binner, [], labels labels = binner = PeriodIndex(start=ax[0], @@ -265,106 +962,6 @@ def _get_time_period_bins(self, ax): return binner, bins, labels - @property - def _agg_method(self): - return self.how if self.how else _DEFAULT_METHOD - - def _resample_timestamps(self, kind=None): - # assumes set_grouper(obj) already called - axlabels = self.ax - - self._get_binner_for_resample(kind=kind) - grouper = self.grouper - binner = self.binner - obj = self.obj - - # Determine if we're downsampling - if axlabels.freq is not None or axlabels.inferred_freq is not None: - - if len(grouper.binlabels) < len(axlabels) or self.how is not None: - # downsample - grouped = obj.groupby(grouper, axis=self.axis) - result = grouped.aggregate(self._agg_method) - # GH2073 - if self.fill_method is not None: - result = result.fillna(method=self.fill_method, - limit=self.limit) - - else: - # upsampling shortcut - if self.axis: - raise AssertionError('axis must be 0') - - if self.closed == 'right': - res_index = binner[1:] - else: - res_index = binner[:-1] - - # if we have the same frequency as our axis, then we are equal sampling - # even if how is None - if self.fill_method is None and self.limit is None and to_offset( - axlabels.inferred_freq) == self.freq: - result = obj.copy() - result.index = res_index - else: - result = obj.reindex(res_index, method=self.fill_method, - limit=self.limit) - else: - # Irregular data, have to use groupby - grouped = obj.groupby(grouper, axis=self.axis) - result = grouped.aggregate(self._agg_method) - - if self.fill_method is not None: - result = result.fillna(method=self.fill_method, - limit=self.limit) - - loffset = self.loffset - if isinstance(loffset, 
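All of the ``_get_time*_bins`` helpers above honor the ``closed`` side when assigning points that sit exactly on a bin edge. A small demonstration of the visible effect, assuming the method-chain API introduced by this patch:

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2000-01-01', periods=10, freq='T')
    ts = pd.Series(np.arange(10), index=rng)

    # an edge point opens the next bin with closed='left', but closes
    # the previous bin with closed='right'
    ts.resample('5T', closed='left').sum()   # bins [00:00, 00:05), [00:05, 00:10)
    ts.resample('5T', closed='right').sum()  # bins (23:55, 00:00], (00:00, 00:05], ...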
compat.string_types): - loffset = to_offset(self.loffset) - - if isinstance(loffset, (DateOffset, timedelta)): - if (isinstance(result.index, DatetimeIndex) - and len(result.index) > 0): - - result.index = result.index + loffset - - return result - - def _resample_periods(self): - # assumes set_grouper(obj) already called - axlabels = self.ax - obj = self.obj - - if len(axlabels) == 0: - new_index = PeriodIndex(data=[], freq=self.freq) - return obj.reindex(new_index) - else: - start = axlabels[0].asfreq(self.freq, how=self.convention) - end = axlabels[-1].asfreq(self.freq, how='end') - - new_index = period_range(start, end, freq=self.freq) - - # Start vs. end of period - memb = axlabels.asfreq(self.freq, how=self.convention) - - if is_subperiod(axlabels.freq, self.freq) or self.how is not None: - # Downsampling - rng = np.arange(memb.values[0], memb.values[-1] + 1) - bins = memb.searchsorted(rng, side='right') - grouper = BinGrouper(bins, new_index) - - grouped = obj.groupby(grouper, axis=self.axis) - return grouped.aggregate(self._agg_method) - elif is_superperiod(axlabels.freq, self.freq): - # Get the fill indexer - indexer = memb.get_indexer(new_index, method=self.fill_method, - limit=self.limit) - return _take_new_index(obj, indexer, new_index, axis=self.axis) - - else: - raise ValueError('Frequency %s cannot be resampled to %s' - % (axlabels.freq, self.freq)) - def _take_new_index(obj, indexer, new_index, axis=0): from pandas.core.api import Series, DataFrame @@ -410,8 +1007,6 @@ def _get_range_edges(first, last, offset, closed='left', base=0): def _adjust_dates_anchored(first, last, offset, closed='right', base=0): -# from pandas.tseries.tools import normalize_date - # First and last offsets should be calculated from the start day to fix an # error cause by resampling across multiple days when a one day period is # not a multiple of the frequency. 
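``_get_range_edges`` and ``_adjust_dates_anchored`` are what make the ``base`` argument shift every bin edge by a fixed offset. A short example of the user-visible behaviour (illustrative data):

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2000-01-01', periods=60, freq='T')
    ts = pd.Series(np.arange(60), index=rng)

    # default 5-minute edges are anchored on the clock: 00:00, 00:05, ...
    ts.resample('5T').sum().index[:3]
    # base=2 shifts every edge by two minutes: 23:57, 00:02, 00:07, ...
    ts.resample('5T', base=2).sum().index[:3]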
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 850071e8e49e6..b9326aa8e1c60 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -3,7 +3,7 @@ from datetime import datetime, timedelta from functools import partial -from pandas.compat import range, lrange, zip, product +from pandas.compat import range, lrange, zip, product, OrderedDict import numpy as np from pandas import (Series, DataFrame, Panel, Index, isnull, @@ -14,8 +14,11 @@ from pandas.tseries.tdi import timedelta_range from pandas.tseries.offsets import Minute, BDay from pandas.tseries.period import period_range, PeriodIndex, Period -from pandas.tseries.resample import DatetimeIndex, TimeGrouper +from pandas.tseries.resample import (DatetimeIndex, TimeGrouper, + DatetimeIndexResampler) from pandas.tseries.frequencies import MONTHS, DAYS +from pandas.core.common import ABCSeries, ABCDataFrame +from pandas.core.base import SpecificationError import pandas.tseries.offsets as offsets import pandas as pd @@ -29,6 +32,461 @@ bday = BDay() +class TestResampleAPI(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005, 1, 1), + end=datetime(2005, 1, 10), freq='Min') + + self.series = Series(np.random.rand(len(dti)), dti) + self.frame = DataFrame( + {'A': self.series, 'B': self.series, 'C': np.arange(len(dti))}) + + def test_str(self): + + r = self.series.resample('H') + self.assertTrue( + 'DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, ' + 'label=left, convention=start, base=0]' in str(r)) + + def test_api(self): + + r = self.series.resample('H') + result = r.mean() + self.assertIsInstance(result, Series) + self.assertEqual(len(result), 217) + + r = self.series.to_frame().resample('H') + result = r.mean() + self.assertIsInstance(result, DataFrame) + self.assertEqual(len(result), 217) + + def test_api_changes_v018(self): + + # change from .resample(....., how=...)
+ # to .resample(......).how() + + r = self.series.resample('H') + self.assertIsInstance(r, DatetimeIndexResampler) + + for how in ['sum', 'mean', 'prod', 'min', 'max', 'var', 'std']: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = self.series.resample('H', how=how) + expected = getattr(self.series.resample('H'), how)() + tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = self.series.resample('H', how='ohlc') + expected = self.series.resample('H').ohlc() + tm.assert_frame_equal(result, expected) + + # compat for pandas-like methods + for how in ['sort_values', 'isnull']: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + getattr(r, how)() + + # invalids as these can be setting operations + r = self.series.resample('H') + self.assertRaises(ValueError, lambda: r.iloc[0]) + self.assertRaises(ValueError, lambda: r.iat[0]) + self.assertRaises(ValueError, lambda: r.ix[0]) + self.assertRaises(ValueError, lambda: r.loc[ + Timestamp('2013-01-01 00:00:00', offset='H')]) + self.assertRaises(ValueError, lambda: r.at[ + Timestamp('2013-01-01 00:00:00', offset='H')]) + + def f(): + r[0] = 5 + self.assertRaises(ValueError, f) + + # str/repr + r = self.series.resample('H') + with tm.assert_produces_warning(None): + str(r) + with tm.assert_produces_warning(None): + repr(r) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + tm.assert_numpy_array_equal(np.array(r), np.array(r.mean())) + + # masquerade as Series/DataFrame as needed for API compat + self.assertTrue(isinstance(self.series.resample('H'), ABCSeries)) + self.assertFalse(isinstance(self.frame.resample('H'), ABCSeries)) + self.assertFalse(isinstance(self.series.resample('H'), ABCDataFrame)) + self.assertTrue(isinstance(self.frame.resample('H'), ABCDataFrame)) + + # bin numeric ops + for op in ['__add__', '__mul__', '__truediv__', '__div__', '__sub__']: + + if getattr(self.series, op, None) is None: + continue + r = self.series.resample('H') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.assertIsInstance(getattr(r, op)(2), pd.Series) + + # unary numeric ops + for op in ['__pos__', '__neg__', '__abs__', '__inv__']: + + if getattr(self.series, op, None) is None: + continue + r = self.series.resample('H') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.assertIsInstance(getattr(r, op)(), pd.Series) + + # comparison ops + for op in ['__lt__', '__le__', '__gt__', '__ge__', '__eq__', '__ne__']: + + r = self.series.resample('H') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.assertIsInstance(getattr(r, op)(2), pd.Series) + + def test_getitem(self): + + r = self.frame.resample('H') + tm.assert_index_equal(r._selected_obj.columns, self.frame.columns) + + r = self.frame.resample('H')['B'] + self.assertEqual(r._selected_obj.name, self.frame.columns[1]) + + # technically this is allowed + r = self.frame.resample('H')['A', 'B'] + tm.assert_index_equal(r._selected_obj.columns, + self.frame.columns[[0, 1]]) + + r = self.frame.resample('H')['A', 'B'] + tm.assert_index_equal(r._selected_obj.columns, + self.frame.columns[[0, 1]]) + + def test_select_bad_cols(self): + + g = self.frame.resample('H') + self.assertRaises(KeyError, g.__getitem__, ['D']) + + self.assertRaises(KeyError, g.__getitem__, ['A', 'D']) + with tm.assertRaisesRegexp(KeyError, '^[^A]+$'): + # A should not be referenced 
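The ``__getitem__`` tests above mirror groupby's column selection semantics. A condensed sketch of what is being pinned down (hypothetical frame):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(8.), 'B': np.arange(8.) * 2},
                      index=pd.date_range('2000-01-01', periods=8, freq='T'))
    r = df.resample('2T')

    # column selection mirrors groupby; these are equivalent
    r['A'].sum()
    r.A.sum()

    # asking for a column the frame does not have raises KeyError
    try:
        r[['A', 'D']]
    except KeyError as e:
        print(e)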
as a bad column... + # will have to rethink regex if you change message! + g[['A', 'D']] + + def test_attribute_access(self): + + r = self.frame.resample('H') + tm.assert_series_equal(r.A.sum(), r['A'].sum()) + + # getting + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertRaises(AttributeError, lambda: r.F) + + # setting + def f(): + r.F = 'bah' + self.assertRaises(ValueError, f) + + def test_api_compat_before_use(self): + + # make sure that we are setting the binner + # on these attributes + for attr in ['groups', 'ngroups', 'indices']: + rng = pd.date_range('1/1/2012', periods=100, freq='S') + ts = pd.Series(np.arange(len(rng)), index=rng) + rs = ts.resample('30s') + + # before use + getattr(rs, attr) + + # after grouper is initialized is ok + rs.mean() + getattr(rs, attr) + + def test_skip_nuisance(self): + + df = self.frame + df['D'] = 'foo' + r = df.resample('H') + result = r[['A', 'B']].sum() + expected = pd.concat([r.A.sum(), r.B.sum()], axis=1) + assert_frame_equal(result, expected) + + expected = r[['A', 'B', 'C']].sum() + result = r.sum() + assert_frame_equal(result, expected) + + def test_downsample_but_actually_upsampling(self): + + # this is reindex / asfreq + rng = pd.date_range('1/1/2012', periods=100, freq='S') + ts = pd.Series(np.arange(len(rng), dtype='int64'), index=rng) + result = ts.resample('20s').asfreq() + expected = Series([0, 20, 40, 60, 80], + index=pd.date_range('2012-01-01 00:00:00', + freq='20s', + periods=5)) + assert_series_equal(result, expected) + + def test_combined_up_downsampling_of_irregular(self): + + # since we are really doing an operation like this + # ts2.resample('2s').mean().ffill() + # preserve these semantics + + rng = pd.date_range('1/1/2012', periods=100, freq='S') + ts = pd.Series(np.arange(len(rng)), index=rng) + ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]] + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = ts2.resample('2s', how='mean', fill_method='ffill') + expected = ts2.resample('2s').mean().ffill() + assert_series_equal(result, expected) + + def test_transform(self): + + r = self.series.resample('20min') + expected = self.series.groupby( + pd.Grouper(freq='20min')).transform('mean') + result = r.transform('mean') + assert_series_equal(result, expected) + + def test_fillna(self): + + # need to upsample here + rng = pd.date_range('1/1/2012', periods=10, freq='2S') + ts = pd.Series(np.arange(len(rng), dtype='int64'), index=rng) + r = ts.resample('s') + + expected = r.ffill() + result = r.fillna(method='ffill') + assert_series_equal(result, expected) + + expected = r.bfill() + result = r.fillna(method='bfill') + assert_series_equal(result, expected) + + def test_apply_without_aggregation(self): + + # both resample and groupby should work w/o aggregation + r = self.series.resample('20min') + g = self.series.groupby(pd.Grouper(freq='20min')) + + for t in [g, r]: + result = t.apply(lambda x: x) + assert_series_equal(result, self.series) + + def test_agg(self): + # test with both a Resampler and a TimeGrouper + + np.random.seed(1234) + df = pd.DataFrame(np.random.rand(10, 2), + columns=list('AB'), + index=pd.date_range('2010-01-01 09:00:00', + periods=10, + freq='s')) + + r = df.resample('2s') + g = df.groupby(pd.Grouper(freq='2s')) + a_mean = r['A'].mean() + a_std = r['A'].std() + a_sum = r['A'].sum() + b_mean = r['B'].mean() + b_std = r['B'].std() + b_sum = r['B'].sum() + + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns =
pd.MultiIndex.from_product([['A', 'B'], + ['mean', 'std']]) + for t in [r, g]: + result = t.aggregate([np.mean, np.std]) + assert_frame_equal(result, expected) + + expected = pd.concat([a_mean, b_std], axis=1) + for t in [r, g]: + result = t.aggregate({'A': np.mean, + 'B': np.std}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([a_mean, a_std], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), + ('A', 'std')]) + for t in [r, g]: + result = t.aggregate({'A': ['mean', 'std']}) + assert_frame_equal(result, expected) + + expected = pd.concat([a_mean, a_sum], axis=1) + expected.columns = ['mean', 'sum'] + for t in [r, g]: + result = t['A'].aggregate(['mean', 'sum']) + assert_frame_equal(result, expected) + + expected = pd.concat([a_mean, a_sum], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), + ('A', 'sum')]) + for t in [r, g]: + result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), + ('A', 'sum'), + ('B', 'mean2'), + ('B', 'sum2')]) + for t in [r, g]: + result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, + 'B': {'mean2': 'mean', 'sum2': 'sum'}}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), + ('A', 'std'), + ('B', 'mean'), + ('B', 'std')]) + for t in [r, g]: + result = t.aggregate({'A': ['mean', 'std'], + 'B': ['mean', 'std']}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'), + ('r1', 'A', 'sum'), + ('r2', 'B', 'mean'), + ('r2', 'B', 'sum')]) + + def test_agg_misc(self): + # test with both a Resampler and a TimeGrouper + + np.random.seed(1234) + df = pd.DataFrame(np.random.rand(10, 2), + columns=list('AB'), + index=pd.date_range('2010-01-01 09:00:00', + periods=10, + freq='s')) + + r = df.resample('2s') + g = df.groupby(pd.Grouper(freq='2s')) + + # passed lambda + for t in [r, g]: + result = t.agg({'A': np.sum, + 'B': lambda x: np.std(x, ddof=1)}) + rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) + expected = pd.concat([r['A'].sum(), rcustom], axis=1) + assert_frame_equal(result, expected, check_like=True) + + # agg with renamers + expected = pd.concat([t['A'].sum(), + t['B'].sum(), + t['A'].mean(), + t['B'].mean()], + axis=1) + expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'), + ('result1', 'B'), + ('result2', 'A'), + ('result2', 'B')]) + for t in [r, g]: + result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), + ('result2', np.mean)])) + assert_frame_equal(result, expected, check_like=True) + + # agg with different hows + expected = pd.concat([t['A'].sum(), + t['A'].std(), + t['B'].mean(), + t['B'].std()], + axis=1) + expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'), + ('A', 'std'), + ('B', 'mean'), + ('B', 'std')]) + for t in [r, g]: + result = t.agg(OrderedDict([('A', ['sum', 'std']), + ('B', ['mean', 'std'])])) + assert_frame_equal(result, expected, check_like=True) + + # equivalent of using a selection list / or not + for t in [r, g]: + result = g[['A', 'B']].agg({'A': ['sum', 'std'], + 'B': ['mean', 'std']}) + assert_frame_equal(result, expected, check_like=True) + + # series like aggs + expected = 
pd.concat([t['A'].sum(), + t['A'].std()], + axis=1) + expected.columns = ['sum', 'std'] + + for t in [r, g]: + result = r['A'].agg({'A': ['sum', 'std']}) + assert_frame_equal(result, expected, check_like=True) + + # errors + for t in [r, g]: + + # invalid names in the agg specification + def f(): + r['A'].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + self.assertRaises(SpecificationError, f) + + def f(): + r[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + self.assertRaises(SpecificationError, f) + + def test_agg_nested_dicts(self): + + np.random.seed(1234) + df = pd.DataFrame(np.random.rand(10, 2), + columns=list('AB'), + index=pd.date_range('2010-01-01 09:00:00', + periods=10, + freq='s')) + + r = df.resample('2s') + g = df.groupby(pd.Grouper(freq='2s')) + + for t in [r, g]: + def f(): + t.aggregate({'r1': {'A': ['mean', 'sum']}, + 'r2': {'B': ['mean', 'sum']}}) + self.assertRaises(ValueError, f) + + for t in [r, g]: + expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), + t['B'].std()], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( + 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) + + result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, + 'B': {'rb': ['mean', 'std']}}) + assert_frame_equal(result, expected, check_like=True) + + result = t.agg({'A': {'ra': ['mean', 'std']}, + 'B': {'rb': ['mean', 'std']}}) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_consistency(self): + + # make sure that we are consistent across + # similar aggregations with and w/o selection list + df = DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2012', freq='S', periods=1000), + columns=['A', 'B', 'C']) + + r = df.resample('3T') + + expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) + result = r.agg({'r1': 'mean', 'r2': 'sum'}) + assert_frame_equal(result, expected) + + class TestResample(tm.TestCase): _multiprocess_can_split_ = True @@ -84,7 +542,7 @@ def test_resample_basic(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min', name='index') s = Series(np.random.randn(14), index=rng) - result = s.resample('5min', how='mean', closed='right', label='right') + result = s.resample('5min', closed='right', label='right').mean() exp_idx = date_range('1/1/2000', periods=4, freq='5min', name='index') expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], @@ -92,7 +550,7 @@ def test_resample_basic(self): assert_series_equal(result, expected) self.assertEqual(result.index.name, 'index') - result = s.resample('5min', how='mean', closed='left', label='right') + result = s.resample('5min', closed='left', label='right').mean() exp_idx = date_range('1/1/2000 00:05', periods=3, freq='5min', name='index') @@ -101,7 +559,7 @@ def test_resample_basic(self): assert_series_equal(result, expected) s = self.series - result = s.resample('5Min', how='last') + result = s.resample('5Min').last() grouper = TimeGrouper(Minute(5), closed='left', label='left') expect = s.groupby(grouper).agg(lambda x: x[-1]) assert_series_equal(result, expect) @@ -131,8 +589,8 @@ def _ohlc(group): else: func = arg try: - result = s.resample('5min', how=arg, closed='right', - label='right') + result = getattr(s.resample( + '5min', closed='right', label='right'), arg)() expected = s.groupby(grouplist).agg(func) self.assertEqual(result.index.name, 'index') @@ -159,14 +617,15 @@ def fn(x, a=1): return str(type(x)) class fn_class: + def __call__(self, x): return str(type(x)) - df_standard = df.resample("M", how=fn) 
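Taken together, the aggregation tests above pin down the list and dict spellings of ``.agg`` and the new ``SpecificationError``. A condensed illustration (made-up data; the exception is imported from ``pandas.core.base`` as in this test module):

    import numpy as np
    import pandas as pd
    from pandas.core.base import SpecificationError

    df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'),
                      index=pd.date_range('2010-01-01 09:00:00',
                                          periods=10, freq='s'))
    r = df.resample('2s')

    # a list of functions yields MultiIndex columns: (A, mean), (A, std), ...
    r.agg([np.mean, np.std])

    # a per-column dict maps each column to its own aggregation
    r.agg({'A': 'sum', 'B': lambda x: np.std(x, ddof=1)})

    # naming columns outside the current selection is now an error
    try:
        r['A'].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']})
    except SpecificationError as e:
        print(e)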
- df_lambda = df.resample("M", how=lambda x: str(type(x))) - df_partial = df.resample("M", how=partial(fn)) - df_partial2 = df.resample("M", how=partial(fn, a=2)) - df_class = df.resample("M", how=fn_class()) + df_standard = df.resample("M").apply(fn) + df_lambda = df.resample("M").apply(lambda x: str(type(x))) + df_partial = df.resample("M").apply(partial(fn)) + df_partial2 = df.resample("M").apply(partial(fn, a=2)) + df_class = df.resample("M").apply(fn_class()) assert_frame_equal(df_standard, df_lambda) assert_frame_equal(df_standard, df_partial) @@ -181,10 +640,32 @@ def test_resample_with_timedeltas(self): df = DataFrame({'A': np.arange(1480)}, index=pd.to_timedelta( np.arange(1480), unit='T')) - result = df.resample('30T', how='sum') + result = df.resample('30T').sum() assert_frame_equal(result, expected) + s = df['A'] + result = s.resample('30T').sum() + assert_series_equal(result, expected['A']) + + def test_resample_single_period_timedelta(self): + + s = Series(list(range(5)), index=pd.timedelta_range( + '1 day', freq='s', periods=5)) + result = s.resample('2s').sum() + expected = Series([1, 5, 4], index=pd.timedelta_range( + '1 day', freq='2s', periods=3)) + assert_series_equal(result, expected) + + def test_resample_timedelta_idempotency(self): + + # GH 12072 + index = pd.timedelta_range('0', periods=9, freq='10L') + series = pd.Series(range(9), index=index) + result = series.resample('10L').mean() + expected = series + assert_series_equal(result, expected) + def test_resample_rounding(self): # GH 8371 # odd results when rounding is needed @@ -214,31 +695,31 @@ def test_resample_rounding(self): df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [ 'date', 'time']}, index_col='timestamp') df.index.name = None - result = df.resample('6s', how='sum') + result = df.resample('6s').sum() expected = DataFrame({'value': [ 4, 9, 4, 2 ]}, index=date_range('2014-11-08', freq='6s', periods=4)) assert_frame_equal(result, expected) - result = df.resample('7s', how='sum') + result = df.resample('7s').sum() expected = DataFrame({'value': [ 4, 10, 4, 1 ]}, index=date_range('2014-11-08', freq='7s', periods=4)) assert_frame_equal(result, expected) - result = df.resample('11s', how='sum') + result = df.resample('11s').sum() expected = DataFrame({'value': [ 11, 8 ]}, index=date_range('2014-11-08', freq='11s', periods=2)) assert_frame_equal(result, expected) - result = df.resample('13s', how='sum') + result = df.resample('13s').sum() expected = DataFrame({'value': [ 13, 6 ]}, index=date_range('2014-11-08', freq='13s', periods=2)) assert_frame_equal(result, expected) - result = df.resample('17s', how='sum') + result = df.resample('17s').sum() expected = DataFrame({'value': [ 16, 3 ]}, index=date_range('2014-11-08', freq='17s', periods=2)) @@ -252,7 +733,7 @@ def test_resample_basic_from_daily(self): s = Series(np.random.rand(len(dti)), dti) # to weekly - result = s.resample('w-sun', how='last') + result = s.resample('w-sun').last() self.assertEqual(len(result), 3) self.assertTrue((result.index.dayofweek == [6, 6, 6]).all()) @@ -260,38 +741,38 @@ def test_resample_basic_from_daily(self): self.assertEqual(result.iloc[1], s['1/9/2005']) self.assertEqual(result.iloc[2], s.iloc[-1]) - result = s.resample('W-MON', how='last') + result = s.resample('W-MON').last() self.assertEqual(len(result), 2) self.assertTrue((result.index.dayofweek == [0, 0]).all()) self.assertEqual(result.iloc[0], s['1/3/2005']) self.assertEqual(result.iloc[1], s['1/10/2005']) - result = s.resample('W-TUE', how='last') + result = 
s.resample('W-TUE').last() self.assertEqual(len(result), 2) self.assertTrue((result.index.dayofweek == [1, 1]).all()) self.assertEqual(result.iloc[0], s['1/4/2005']) self.assertEqual(result.iloc[1], s['1/10/2005']) - result = s.resample('W-WED', how='last') + result = s.resample('W-WED').last() self.assertEqual(len(result), 2) self.assertTrue((result.index.dayofweek == [2, 2]).all()) self.assertEqual(result.iloc[0], s['1/5/2005']) self.assertEqual(result.iloc[1], s['1/10/2005']) - result = s.resample('W-THU', how='last') + result = s.resample('W-THU').last() self.assertEqual(len(result), 2) self.assertTrue((result.index.dayofweek == [3, 3]).all()) self.assertEqual(result.iloc[0], s['1/6/2005']) self.assertEqual(result.iloc[1], s['1/10/2005']) - result = s.resample('W-FRI', how='last') + result = s.resample('W-FRI').last() self.assertEqual(len(result), 2) self.assertTrue((result.index.dayofweek == [4, 4]).all()) self.assertEqual(result.iloc[0], s['1/7/2005']) self.assertEqual(result.iloc[1], s['1/10/2005']) # to biz day - result = s.resample('B', how='last') + result = s.resample('B').last() self.assertEqual(len(result), 7) self.assertTrue((result.index.dayofweek == [ 4, 0, 1, 2, 3, 4, 0 @@ -307,7 +788,7 @@ def test_resample_upsampling_picked_but_not_correct(self): dates = date_range('01-Jan-2014', '05-Jan-2014', freq='D') series = Series(1, index=dates) - result = series.resample('D') + result = series.resample('D').mean() self.assertEqual(result.index[0], dates[0]) # GH 5955 @@ -320,15 +801,13 @@ def test_resample_upsampling_picked_but_not_correct(self): expected = Series(np.arange(1., 6), index=date_range( '19750101', periods=5, freq='D')) - result = s.resample('D', how='count') + result = s.resample('D').count() assert_series_equal(result, Series(1, index=expected.index)) - result1 = s.resample('D', how='sum') - result2 = s.resample('D', how='mean') - result3 = s.resample('D') + result1 = s.resample('D').sum() + result2 = s.resample('D').mean() assert_series_equal(result1, expected) assert_series_equal(result2, expected) - assert_series_equal(result3, expected) def test_resample_frame_basic(self): df = tm.makeTimeDataFrame() @@ -341,32 +820,34 @@ def test_resample_frame_basic(self): for f in funcs: g._cython_agg_general(f) - result = df.resample('A') - assert_series_equal(result['A'], df['A'].resample('A')) + result = df.resample('A').mean() + assert_series_equal(result['A'], df['A'].resample('A').mean()) - result = df.resample('M') - assert_series_equal(result['A'], df['A'].resample('M')) + result = df.resample('M').mean() + assert_series_equal(result['A'], df['A'].resample('M').mean()) - df.resample('M', kind='period') - df.resample('W-WED', kind='period') + df.resample('M', kind='period').mean() + df.resample('W-WED', kind='period').mean() def test_resample_loffset(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') s = Series(np.random.randn(14), index=rng) - result = s.resample('5min', how='mean', closed='right', label='right', - loffset=timedelta(minutes=1)) + result = s.resample('5min', closed='right', label='right', + loffset=timedelta(minutes=1)).mean() idx = date_range('1/1/2000', periods=4, freq='5min') expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], index=idx + timedelta(minutes=1)) assert_series_equal(result, expected) - expected = s.resample('5min', how='mean', closed='right', - label='right', loffset='1min') + expected = s.resample( + '5min', closed='right', label='right', + loffset='1min').mean() 
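``loffset`` only relabels the result index after aggregation, which is why the string, ``timedelta`` and offset spellings tested here are interchangeable. A compact check (illustrative series):

    import numpy as np
    import pandas as pd
    from datetime import timedelta

    rng = pd.date_range('2000-01-01', periods=14, freq='T')
    s = pd.Series(np.random.randn(14), index=rng)

    r1 = s.resample('5min', closed='right', label='right',
                    loffset='1min').mean()
    r2 = s.resample('5min', closed='right', label='right',
                    loffset=timedelta(minutes=1)).mean()
    assert (r1.index == r2.index).all()  # only the labels move, not the bins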
assert_series_equal(result, expected) - expected = s.resample('5min', how='mean', closed='right', - label='right', loffset=Minute(1)) + expected = s.resample( + '5min', closed='right', label='right', + loffset=Minute(1)).mean() assert_series_equal(result, expected) self.assertEqual(result.index.freq, Minute(5)) @@ -377,8 +858,8 @@ def test_resample_loffset(self): ser = Series(np.random.rand(len(dti)), dti) # to weekly - result = ser.resample('w-sun', how='last') - expected = ser.resample('w-sun', how='last', loffset=-bday) + result = ser.resample('w-sun').last() + expected = ser.resample('w-sun', loffset=-bday).last() self.assertEqual(result.index[0] - bday, expected.index[0]) def test_resample_upsample(self): @@ -389,7 +870,7 @@ def test_resample_upsample(self): s = Series(np.random.rand(len(dti)), dti) # to minutely, by padding - result = s.resample('Min', fill_method='pad') + result = s.resample('Min').pad() self.assertEqual(len(result), 12961) self.assertEqual(result[0], s[0]) self.assertEqual(result[-1], s[-1]) @@ -404,14 +885,14 @@ def test_resample_extra_index_point(self): index = DatetimeIndex(start='20150101', end='20150331', freq='B') df = DataFrame( {'A': Series(range(len(index)), index=index)}, dtype='int64') - result = df.resample('BM', how='last') + result = df.resample('BM').last() assert_frame_equal(result, expected) def test_upsample_with_limit(self): rng = date_range('1/1/2000', periods=3, freq='5t') ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('t', fill_method='ffill', limit=2) + result = ts.resample('t').ffill(limit=2) expected = ts.reindex(result.index, method='ffill', limit=2) assert_series_equal(result, expected) @@ -420,7 +901,7 @@ def test_resample_ohlc(self): grouper = TimeGrouper(Minute(5)) expect = s.groupby(grouper).agg(lambda x: x[-1]) - result = s.resample('5Min', how='ohlc') + result = s.resample('5Min').ohlc() self.assertEqual(len(result), len(expect)) self.assertEqual(len(result.columns), 4) @@ -449,15 +930,15 @@ def test_resample_ohlc_dataframe(self): Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, Timestamp('2011-01-06 12:54:09', tz=None): 100000000}}) ).reindex_axis(['VOLUME', 'PRICE'], axis=1) - res = df.resample('H', how='ohlc') - exp = pd.concat([df['VOLUME'].resample('H', how='ohlc'), - df['PRICE'].resample('H', how='ohlc')], + res = df.resample('H').ohlc() + exp = pd.concat([df['VOLUME'].resample('H').ohlc(), + df['PRICE'].resample('H').ohlc()], axis=1, keys=['VOLUME', 'PRICE']) assert_frame_equal(exp, res) df.columns = [['a', 'b'], ['c', 'd']] - res = df.resample('H', how='ohlc') + res = df.resample('H').ohlc() exp.columns = pd.MultiIndex.from_tuples([('a', 'c', 'open'), ( 'a', 'c', 'high'), ('a', 'c', 'low'), ('a', 'c', 'close'), ( 'b', 'd', 'open'), ('b', 'd', 'high'), ('b', 'd', 'low'), ( @@ -475,7 +956,7 @@ def test_resample_dup_index(self): columns=[Period(year=2000, month=i + 1, freq='M') for i in range(12)]) df.iloc[3, :] = np.nan - result = df.resample('Q', axis=1) + result = df.resample('Q', axis=1).mean() expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean() expected.columns = [ Period(year=2000, quarter=i + 1, freq='Q') for i in range(4)] @@ -485,8 +966,8 @@ def test_resample_reresample(self): dti = DatetimeIndex(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq='D') s = Series(np.random.rand(len(dti)), dti) - bs = s.resample('B', closed='right', label='right') - result = bs.resample('8H') + bs = s.resample('B', closed='right', label='right').mean() + result = 
bs.resample('8H').mean() self.assertEqual(len(result), 22) tm.assertIsInstance(result.index.freq, offsets.DateOffset) self.assertEqual(result.index.freq, offsets.Hour(8)) @@ -494,23 +975,23 @@ def test_resample_reresample(self): def test_resample_timestamp_to_period(self): ts = _simple_ts('1/1/1990', '1/1/2000') - result = ts.resample('A-DEC', kind='period') - expected = ts.resample('A-DEC') + result = ts.resample('A-DEC', kind='period').mean() + expected = ts.resample('A-DEC').mean() expected.index = period_range('1990', '2000', freq='a-dec') assert_series_equal(result, expected) - result = ts.resample('A-JUN', kind='period') - expected = ts.resample('A-JUN') + result = ts.resample('A-JUN', kind='period').mean() + expected = ts.resample('A-JUN').mean() expected.index = period_range('1990', '2000', freq='a-jun') assert_series_equal(result, expected) - result = ts.resample('M', kind='period') - expected = ts.resample('M') + result = ts.resample('M', kind='period').mean() + expected = ts.resample('M').mean() expected.index = period_range('1990-01', '2000-01', freq='M') assert_series_equal(result, expected) - result = ts.resample('M', kind='period') - expected = ts.resample('M') + result = ts.resample('M', kind='period').mean() + expected = ts.resample('M').mean() expected.index = period_range('1990-01', '2000-01', freq='M') assert_series_equal(result, expected) @@ -523,8 +1004,8 @@ def _ohlc(group): rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', freq='10s') ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('5min', how='ohlc', closed='right', - label='right') + resampled = ts.resample('5min', closed='right', + label='right').ohlc() self.assertTrue((resampled.ix['1/1/2000 00:00'] == ts[0]).all()) @@ -539,7 +1020,7 @@ def test_downsample_non_unique(self): rng2 = rng.repeat(5).values ts = Series(np.random.randn(len(rng2)), index=rng2) - result = ts.resample('M', how='mean') + result = ts.resample('M').mean() expected = ts.groupby(lambda x: x.month).mean() self.assertEqual(len(result), 2) @@ -559,8 +1040,8 @@ def test_resample_axis1(self): df = DataFrame(np.random.randn(3, len(rng)), columns=rng, index=['a', 'b', 'c']) - result = df.resample('M', axis=1) - expected = df.T.resample('M').T + result = df.resample('M', axis=1).mean() + expected = df.T.resample('M').mean().T tm.assert_frame_equal(result, expected) def test_resample_panel(self): @@ -572,7 +1053,7 @@ def test_resample_panel(self): major_axis=rng, minor_axis=['a', 'b', 'c', 'd', 'e']) - result = panel.resample('M', axis=1) + result = panel.resample('M', axis=1).mean() def p_apply(panel, f): result = {} @@ -580,12 +1061,12 @@ def p_apply(panel, f): result[item] = f(panel[item]) return Panel(result, items=panel.items) - expected = p_apply(panel, lambda x: x.resample('M')) + expected = p_apply(panel, lambda x: x.resample('M').mean()) tm.assert_panel_equal(result, expected) panel2 = panel.swapaxes(1, 2) - result = panel2.resample('M', axis=2) - expected = p_apply(panel2, lambda x: x.resample('M', axis=1)) + result = panel2.resample('M', axis=2).mean() + expected = p_apply(panel2, lambda x: x.resample('M', axis=1).mean()) tm.assert_panel_equal(result, expected) def test_resample_panel_numpy(self): @@ -597,13 +1078,13 @@ def test_resample_panel_numpy(self): major_axis=rng, minor_axis=['a', 'b', 'c', 'd', 'e']) - result = panel.resample('M', how=lambda x: x.mean(1), axis=1) - expected = panel.resample('M', how='mean', axis=1) + result = panel.resample('M', axis=1).apply(lambda x: x.mean(1)) + expected = 
panel.resample('M', axis=1).mean() tm.assert_panel_equal(result, expected) panel = panel.swapaxes(1, 2) - result = panel.resample('M', how=lambda x: x.mean(2), axis=2) - expected = panel.resample('M', how='mean', axis=2) + result = panel.resample('M', axis=2).apply(lambda x: x.mean(2)) + expected = panel.resample('M', axis=2).mean() tm.assert_panel_equal(result, expected) def test_resample_anchored_ticks(self): @@ -618,8 +1099,8 @@ def test_resample_anchored_ticks(self): freqs = ['t', '5t', '15t', '30t', '4h', '12h'] for freq in freqs: - result = ts[2:].resample(freq, closed='left', label='left') - expected = ts.resample(freq, closed='left', label='left') + result = ts[2:].resample(freq, closed='left', label='left').mean() + expected = ts.resample(freq, closed='left', label='left').mean() assert_series_equal(result, expected) def test_resample_single_group(self): @@ -627,26 +1108,26 @@ def test_resample_single_group(self): rng = date_range('2000-1-1', '2000-2-10', freq='D') ts = Series(np.random.randn(len(rng)), index=rng) - assert_series_equal(ts.resample('M', how='sum'), - ts.resample('M', how=mysum)) + assert_series_equal(ts.resample('M').sum(), + ts.resample('M').apply(mysum)) rng = date_range('2000-1-1', '2000-1-10', freq='D') ts = Series(np.random.randn(len(rng)), index=rng) - assert_series_equal(ts.resample('M', how='sum'), - ts.resample('M', how=mysum)) + assert_series_equal(ts.resample('M').sum(), + ts.resample('M').apply(mysum)) # GH 3849 s = Series([30.1, 31.6], index=[Timestamp('20070915 15:30:00'), Timestamp('20070915 15:40:00')]) expected = Series([0.75], index=[Timestamp('20070915')]) - result = s.resample('D', how=lambda x: np.std(x)) + result = s.resample('D').apply(lambda x: np.std(x)) assert_series_equal(result, expected) def test_resample_base(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s') ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('5min', base=2) + resampled = ts.resample('5min', base=2).mean() exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57', freq='5min') self.assertTrue(resampled.index.equals(exp_rng)) @@ -657,8 +1138,8 @@ def test_resample_base_with_timedeltaindex(self): rng = timedelta_range(start='0s', periods=25, freq='s') ts = Series(np.random.randn(len(rng)), index=rng) - with_base = ts.resample('2s', base=5) - without_base = ts.resample('2s') + with_base = ts.resample('2s', base=5).mean() + without_base = ts.resample('2s').mean() exp_without_base = timedelta_range(start='0s', end='25s', freq='2s') exp_with_base = timedelta_range(start='5s', end='29s', freq='2s') @@ -671,8 +1152,8 @@ def test_resample_daily_anchored(self): ts = Series(np.random.randn(len(rng)), index=rng) ts[:2] = np.nan # so results are the same - result = ts[2:].resample('D', closed='left', label='left') - expected = ts.resample('D', closed='left', label='left') + result = ts[2:].resample('D', closed='left', label='left').mean() + expected = ts.resample('D', closed='left', label='left').mean() assert_series_equal(result, expected) def test_resample_to_period_monthly_buglet(self): @@ -681,24 +1162,37 @@ def test_resample_to_period_monthly_buglet(self): rng = date_range('1/1/2000', '12/31/2000') ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('M', kind='period') + result = ts.resample('M', kind='period').mean() exp_index = period_range('Jan-2000', 'Dec-2000', freq='M') self.assertTrue(result.index.equals(exp_index)) + def test_period_with_agg(self): + + # aggregate a period resampler with a lambda + 
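``test_period_with_agg`` below exercises the path where a lambda aggregation over a ``PeriodIndex`` matches the named method. Roughly, as a sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.rand(50),
                  index=pd.period_range('2012-01-01', freq='H', periods=50))

    # the named method and an equivalent lambda agree, and both keep
    # the PeriodIndex on the result
    s.resample('D').mean()
    s.resample('D').agg(lambda x: x.mean())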
s2 = pd.Series(np.random.randint(0, 5, 50), + index=pd.period_range('2012-01-01', + freq='H', + periods=50), + dtype='float64') + + expected = s2.to_timestamp().resample('D').mean().to_period() + result = s2.resample('D').agg(lambda x: x.mean()) + assert_series_equal(result, expected) + def test_resample_empty(self): ts = _simple_ts('1/1/2000', '2/1/2000')[:0] - result = ts.resample('A') + result = ts.resample('A').mean() self.assertEqual(len(result), 0) self.assertEqual(result.index.freqstr, 'A-DEC') - result = ts.resample('A', kind='period') + result = ts.resample('A', kind='period').mean() self.assertEqual(len(result), 0) self.assertEqual(result.index.freqstr, 'A-DEC') + # this is a non datetimelike index xp = DataFrame() - rs = xp.resample('A') - assert_frame_equal(xp, rs) + self.assertRaises(TypeError, lambda: xp.resample('A').mean()) # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. We just run @@ -708,7 +1202,7 @@ def test_resample_empty(self): for how in ('count', 'mean', 'min', 'ohlc', 'last', 'prod'): empty_series = pd.Series([], index, dtype) try: - empty_series.resample('d', how) + getattr(empty_series.resample('d'), how)() except DataError: # Ignore these since some combinations are invalid # (ex: doing mean with dtype of np.object) @@ -719,8 +1213,8 @@ def test_weekly_resample_buglet(self): rng = date_range('1/1/2000', freq='B', periods=20) ts = Series(np.random.randn(len(rng)), index=rng) - resampled = ts.resample('W') - expected = ts.resample('W-SUN') + resampled = ts.resample('W').mean() + expected = ts.resample('W-SUN').mean() assert_series_equal(resampled, expected) def test_monthly_resample_error(self): @@ -736,12 +1230,13 @@ def test_resample_anchored_intraday(self): rng = date_range('1/1/2012', '4/1/2012', freq='100min') df = DataFrame(rng.month, index=rng) - result = df.resample('M') - expected = df.resample('M', kind='period').to_timestamp(how='end') + result = df.resample('M').mean() + expected = df.resample( + 'M', kind='period').mean().to_timestamp(how='end') tm.assert_frame_equal(result, expected) - result = df.resample('M', closed='left') - exp = df.tshift(1, freq='D').resample('M', kind='period') + result = df.resample('M', closed='left').mean() + exp = df.tshift(1, freq='D').resample('M', kind='period').mean() exp = exp.to_timestamp(how='end') tm.assert_frame_equal(result, exp) @@ -749,18 +1244,19 @@ def test_resample_anchored_intraday(self): rng = date_range('1/1/2012', '4/1/2012', freq='100min') df = DataFrame(rng.month, index=rng) - result = df.resample('Q') - expected = df.resample('Q', kind='period').to_timestamp(how='end') + result = df.resample('Q').mean() + expected = df.resample( + 'Q', kind='period').mean().to_timestamp(how='end') tm.assert_frame_equal(result, expected) - result = df.resample('Q', closed='left') + result = df.resample('Q', closed='left').mean() expected = df.tshift(1, freq='D').resample('Q', kind='period', - closed='left') + closed='left').mean() expected = expected.to_timestamp(how='end') tm.assert_frame_equal(result, expected) ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h') - resampled = ts.resample('M') + resampled = ts.resample('M').mean() self.assertEqual(len(resampled), 1) def test_resample_anchored_monthstart(self): @@ -769,7 +1265,7 @@ def test_resample_anchored_monthstart(self): freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN'] for freq in freqs: - result = ts.resample(freq, how='mean') # noqa + ts.resample(freq).mean() def 
test_resample_anchored_multiday(self): # When resampling a range spanning multiple days, ensure that the @@ -785,12 +1281,12 @@ def test_resample_anchored_multiday(self): periods=2, freq='2200L')) # Ensure left closing works - result = s.resample('2200L', 'mean') + result = s.resample('2200L').mean() self.assertEqual(result.index[-1], pd.Timestamp('2014-10-15 23:00:02.000')) # Ensure right closing works - result = s.resample('2200L', 'mean', label='right') + result = s.resample('2200L', label='right').mean() self.assertEqual(result.index[-1], pd.Timestamp('2014-10-15 23:00:04.200')) @@ -800,18 +1296,18 @@ def test_corner_cases(self): rng = date_range('1/1/2000', periods=12, freq='t') ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('5t', closed='right', label='left') + result = ts.resample('5t', closed='right', label='left').mean() ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t') self.assertTrue(result.index.equals(ex_index)) len0pts = _simple_pts('2007-01', '2010-05', freq='M')[:0] # it works - result = len0pts.resample('A-DEC') + result = len0pts.resample('A-DEC').mean() self.assertEqual(len(result), 0) # resample to periods ts = _simple_ts('2000-04-28', '2000-04-30 11:00', freq='h') - result = ts.resample('M', kind='period') + result = ts.resample('M', kind='period').mean() self.assertEqual(len(result), 1) self.assertEqual(result.index[0], Period('2000-04', freq='M')) @@ -819,7 +1315,7 @@ def test_anchored_lowercase_buglet(self): dates = date_range('4/16/2012 20:00', periods=50000, freq='s') ts = Series(np.random.randn(len(dates)), index=dates) # it works! - ts.resample('d') + ts.resample('d').mean() def test_upsample_apply_functions(self): # #1596 @@ -827,7 +1323,7 @@ def test_upsample_apply_functions(self): ts = Series(np.random.randn(len(rng)), index=rng) - result = ts.resample('20min', how=['mean', 'sum']) + result = ts.resample('20min').aggregate(['mean', 'sum']) tm.assertIsInstance(result, DataFrame) def test_resample_not_monotonic(self): @@ -836,8 +1332,8 @@ def test_resample_not_monotonic(self): ts = ts.take(np.random.permutation(len(ts))) - result = ts.resample('D', how='sum') - exp = ts.sort_index().resample('D', how='sum') + result = ts.resample('D').sum() + exp = ts.sort_index().resample('D').sum() assert_series_equal(result, exp) def test_resample_median_bug_1688(self): @@ -847,11 +1343,11 @@ def test_resample_median_bug_1688(self): datetime(2012, 1, 1, 0, 5, 0)], dtype=dtype) - result = df.resample("T", how=lambda x: x.mean()) + result = df.resample("T").apply(lambda x: x.mean()) exp = df.asfreq('T') tm.assert_frame_equal(result, exp) - result = df.resample("T", how="median") + result = df.resample("T").median() exp = df.asfreq('T') tm.assert_frame_equal(result, exp) @@ -859,20 +1355,23 @@ def test_how_lambda_functions(self): ts = _simple_ts('1/1/2000', '4/1/2000') - result = ts.resample('M', how=lambda x: x.mean()) - exp = ts.resample('M', how='mean') + result = ts.resample('M').apply(lambda x: x.mean()) + exp = ts.resample('M').mean() tm.assert_series_equal(result, exp) - self.assertRaises(Exception, ts.resample, 'M', - how=[lambda x: x.mean(), lambda x: x.std(ddof=1)]) - - result = ts.resample('M', how={'foo': lambda x: x.mean(), - 'bar': lambda x: x.std(ddof=1)}) - foo_exp = ts.resample('M', how='mean') + foo_exp = ts.resample('M').mean() foo_exp.name = 'foo' - bar_exp = ts.resample('M', how='std') + bar_exp = ts.resample('M').std() bar_exp.name = 'bar' + result = ts.resample('M').apply( + [lambda x: x.mean(), lambda x: 
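Passing a list of callables through ``.apply`` gives one result column per function, which is what the rewritten ``test_how_lambda_functions`` below relies on. A sketch with made-up data:

    import numpy as np
    import pandas as pd

    ts = pd.Series(np.random.randn(120),
                   index=pd.date_range('2000-01-01', periods=120, freq='D'))

    result = ts.resample('M').apply([lambda x: x.mean(),
                                     lambda x: x.std(ddof=1)])
    result.columns = ['foo', 'bar']  # lambdas otherwise share a default name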
x.std(ddof=1)]) + result.columns = ['foo', 'bar'] + tm.assert_series_equal(result['foo'], foo_exp) + tm.assert_series_equal(result['bar'], bar_exp) + + result = ts.resample('M').aggregate({'foo': lambda x: x.mean(), + 'bar': lambda x: x.std(ddof=1)}) tm.assert_series_equal(result['foo'], foo_exp) tm.assert_series_equal(result['bar'], bar_exp) @@ -885,7 +1384,7 @@ def test_resample_unequal_times(self): df = DataFrame({'close': 1}, index=bad_ind) # it works! - df.resample('AS', 'sum') + df.resample('AS').sum() def test_resample_consistency(self): @@ -902,8 +1401,8 @@ def test_resample_consistency(self): s10 = s.reindex(index=i10, method='bfill') s10_2 = s.reindex(index=i10, method='bfill', limit=2) rl = s.reindex_like(s10, method='bfill', limit=2) - r10_2 = s.resample('10Min', fill_method='bfill', limit=2) - r10 = s.resample('10Min', fill_method='bfill') + r10_2 = s.resample('10Min').bfill(limit=2) + r10 = s.resample('10Min').bfill() # s10_2, r10, r10_2, rl should all be equal assert_series_equal(s10_2, r10) @@ -921,7 +1420,7 @@ def test_resample_timegrouper(self): for dates in [dates1, dates2, dates3]: df = DataFrame(dict(A=dates, B=np.arange(len(dates)))) - result = df.set_index('A').resample('M', how='count') + result = df.set_index('A').resample('M').count() exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30', '2014-10-31', '2014-11-30'], @@ -934,7 +1433,7 @@ def test_resample_timegrouper(self): df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange( len(dates)))) - result = df.set_index('A').resample('M', how='count') + result = df.set_index('A').resample('M').count() expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]}, index=exp_idx, columns=['B', 'C']) assert_frame_equal(result, expected) @@ -948,7 +1447,7 @@ def test_resample_group_info(self): # GH10914 ts = Series(np.random.randint(0, n // k, n).astype('int64'), index=np.random.choice(dr, n)) - left = ts.resample('30T', how='nunique') + left = ts.resample('30T').nunique() ix = date_range(start=ts.index.min(), end=ts.index.max(), freq='30T') @@ -972,7 +1471,7 @@ def test_resample_size(self): dr = date_range('2015-09-19', periods=n, freq='T') ts = Series(np.random.randn(n), index=np.random.choice(dr, n)) - left = ts.resample('7T', how='size') + left = ts.resample('7T').size() ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T') bins = np.searchsorted(ix.values, ts.index.values, side='right') @@ -986,11 +1485,11 @@ def test_resmaple_dst_anchor(self): # 5172 dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern') df = DataFrame([5], index=dti) - assert_frame_equal(df.resample(rule='D', how='sum'), + assert_frame_equal(df.resample(rule='D').sum(), DataFrame([5], index=df.index.normalize())) - df.resample(rule='MS', how='sum') + df.resample(rule='MS').sum() assert_frame_equal( - df.resample(rule='MS', how='sum'), + df.resample(rule='MS').sum(), DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)], tz='US/Eastern'))) @@ -1003,7 +1502,7 @@ def test_resmaple_dst_anchor(self): how = {"a": "min", "b": "max", "c": "count"} assert_frame_equal( - df.resample("W-MON", how=how)[["a", "b", "c"]], + df.resample("W-MON").agg(how)[["a", "b", "c"]], DataFrame({"a": [0, 48, 384, 720, 1056, 1394], "b": [47, 383, 719, 1055, 1393, 1586], "c": [48, 336, 336, 336, 338, 193]}, @@ -1012,7 +1511,7 @@ def test_resmaple_dst_anchor(self): 'W-MON Frequency') assert_frame_equal( - df.resample("2W-MON", how=how)[["a", "b", "c"]], + df.resample("2W-MON").agg(how)[["a", "b", "c"]], 
DataFrame({"a": [0, 48, 720, 1394], "b": [47, 719, 1393, 1586], "c": [48, 672, 674, 193]}, @@ -1021,7 +1520,7 @@ def test_resmaple_dst_anchor(self): '2W-MON Frequency') assert_frame_equal( - df.resample("MS", how=how)[["a", "b", "c"]], + df.resample("MS").agg(how)[["a", "b", "c"]], DataFrame({"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]}, @@ -1030,7 +1529,7 @@ def test_resmaple_dst_anchor(self): 'MS Frequency') assert_frame_equal( - df.resample("2MS", how=how)[["a", "b", "c"]], + df.resample("2MS").agg(how)[["a", "b", "c"]], DataFrame({"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]}, @@ -1040,7 +1539,7 @@ def test_resmaple_dst_anchor(self): df_daily = df['10/26/2013':'10/29/2013'] assert_frame_equal( - df_daily.resample("D", how={"a": "min", "b": "max", "c": "count"}) + df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"}) [["a", "b", "c"]], DataFrame({"a": [1248, 1296, 1346, 1394], "b": [1295, 1345, 1393, 1441], @@ -1104,36 +1603,36 @@ def _check_annual_upsample_cases(self, targ, conv, meth, end='12/31/1991'): for month in MONTHS: ts = _simple_pts('1/1/1990', end, freq='A-%s' % month) - result = ts.resample(targ, fill_method=meth, convention=conv) + result = getattr(ts.resample(targ, convention=conv), meth)() expected = result.to_timestamp(targ, how=conv) expected = expected.asfreq(targ, meth).to_period() assert_series_equal(result, expected) def test_basic_downsample(self): ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') - result = ts.resample('a-dec') + result = ts.resample('a-dec').mean() expected = ts.groupby(ts.index.year).mean() expected.index = period_range('1/1/1990', '6/30/1995', freq='a-dec') assert_series_equal(result, expected) # this is ok - assert_series_equal(ts.resample('a-dec'), result) - assert_series_equal(ts.resample('a'), result) + assert_series_equal(ts.resample('a-dec').mean(), result) + assert_series_equal(ts.resample('a').mean(), result) def test_not_subperiod(self): # These are incompatible period rules for resampling ts = _simple_pts('1/1/1990', '6/30/1995', freq='w-wed') - self.assertRaises(ValueError, ts.resample, 'a-dec') - self.assertRaises(ValueError, ts.resample, 'q-mar') - self.assertRaises(ValueError, ts.resample, 'M') - self.assertRaises(ValueError, ts.resample, 'w-thu') + self.assertRaises(ValueError, lambda: ts.resample('a-dec').mean()) + self.assertRaises(ValueError, lambda: ts.resample('q-mar').mean()) + self.assertRaises(ValueError, lambda: ts.resample('M').mean()) + self.assertRaises(ValueError, lambda: ts.resample('w-thu').mean()) def test_basic_upsample(self): ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') - result = ts.resample('a-dec') + result = ts.resample('a-dec').mean() - resampled = result.resample('D', fill_method='ffill', convention='end') + resampled = result.resample('D', convention='end').ffill() expected = result.to_timestamp('D', how='end') expected = expected.asfreq('D', 'ffill').to_period() @@ -1144,8 +1643,7 @@ def test_upsample_with_limit(self): rng = period_range('1/1/2000', periods=5, freq='A') ts = Series(np.random.randn(len(rng)), rng) - result = ts.resample('M', fill_method='ffill', limit=2, - convention='end') + result = ts.resample('M', convention='end').ffill(limit=2) expected = ts.asfreq('M').reindex(result.index, method='ffill', limit=2) assert_series_equal(result, expected) @@ -1153,14 +1651,14 @@ def test_upsample_with_limit(self): def test_annual_upsample(self): ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-DEC') df = DataFrame({'a': ts}) - rdf = df.resample('D', 
@@ -1153,14 +1651,14 @@ def test_upsample_with_limit(self):

     def test_annual_upsample(self):
         ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-DEC')
         df = DataFrame({'a': ts})
-        rdf = df.resample('D', fill_method='ffill')
-        exp = df['a'].resample('D', fill_method='ffill')
+        rdf = df.resample('D').ffill()
+        exp = df['a'].resample('D').ffill()
         assert_series_equal(rdf['a'], exp)

         rng = period_range('2000', '2003', freq='A-DEC')
         ts = Series([1, 2, 3, 4], index=rng)

-        result = ts.resample('M', fill_method='ffill')
+        result = ts.resample('M').ffill()
         ex_index = period_range('2000-01', '2003-12', freq='M')

         expected = ts.asfreq('M', how='start').reindex(ex_index,
@@ -1174,8 +1672,7 @@ def test_quarterly_upsample(self):
             ts = _simple_pts('1/1/1990', '12/31/1995', freq='Q-%s' % month)

             for targ, conv in product(targets, ['start', 'end']):
-                result = ts.resample(targ, fill_method='ffill',
-                                     convention=conv)
+                result = ts.resample(targ, convention=conv).ffill()
                 expected = result.to_timestamp(targ, how=conv)
                 expected = expected.asfreq(targ, 'ffill').to_period()
                 assert_series_equal(result, expected)
@@ -1186,7 +1683,7 @@ def test_monthly_upsample(self):
         ts = _simple_pts('1/1/1990', '12/31/1995', freq='M')

         for targ, conv in product(targets, ['start', 'end']):
-            result = ts.resample(targ, fill_method='ffill', convention=conv)
+            result = ts.resample(targ, convention=conv).ffill()
             expected = result.to_timestamp(targ, how=conv)
             expected = expected.asfreq(targ, 'ffill').to_period()
             assert_series_equal(result, expected)
@@ -1195,8 +1692,8 @@ def test_fill_method_and_how_upsample(self):
         # GH2073
         s = Series(np.arange(9, dtype='int64'),
                    index=date_range('2010-01-01', periods=9, freq='Q'))
-        last = s.resample('M', fill_method='ffill')
-        both = s.resample('M', how='last', fill_method='ffill').astype('int64')
+        last = s.resample('M').ffill()
+        both = s.resample('M').ffill().resample('M').last().astype('int64')
         assert_series_equal(last, both)

     def test_weekly_upsample(self):
@@ -1206,8 +1703,7 @@ def test_weekly_upsample(self):
             ts = _simple_pts('1/1/1990', '12/31/1995', freq='W-%s' % day)

             for targ, conv in product(targets, ['start', 'end']):
-                result = ts.resample(targ, fill_method='ffill',
-                                     convention=conv)
+                result = ts.resample(targ, convention=conv).ffill()
                 expected = result.to_timestamp(targ, how=conv)
                 expected = expected.asfreq(targ, 'ffill').to_period()
                 assert_series_equal(result, expected)
@@ -1215,14 +1711,14 @@ def test_resample_to_timestamps(self):
         ts = _simple_pts('1/1/1990', '12/31/1995', freq='M')

-        result = ts.resample('A-DEC', kind='timestamp')
-        expected = ts.to_timestamp(how='end').resample('A-DEC')
+        result = ts.resample('A-DEC', kind='timestamp').mean()
+        expected = ts.to_timestamp(how='end').resample('A-DEC').mean()
         assert_series_equal(result, expected)

     def test_resample_to_quarterly(self):
         for month in MONTHS:
             ts = _simple_pts('1990', '1992', freq='A-%s' % month)
-            quar_ts = ts.resample('Q-%s' % month, fill_method='ffill')
+            quar_ts = ts.resample('Q-%s' % month).ffill()

             stamps = ts.to_timestamp('D', how='start')
             qdates = period_range(ts.index[0].asfreq('D', 'start'),
@@ -1239,12 +1735,12 @@ def test_resample_to_quarterly(self):
         ts = _simple_pts('1990', '1992', freq='A-JUN')

         for how in ['start', 'end']:
-            result = ts.resample('Q-MAR', convention=how, fill_method='ffill')
+            result = ts.resample('Q-MAR', convention=how).ffill()
             expected = ts.asfreq('Q-MAR', how=how)
             expected = expected.reindex(result.index, method='ffill')

             # .to_timestamp('D')
-            # expected = expected.resample('Q-MAR', fill_method='ffill')
+            # expected = expected.resample('Q-MAR').ffill()

             assert_series_equal(result, expected)
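Review note (illustrative, not part of the patch): ``test_fill_method_and_how_upsample`` above is the most instructive rewrite in this group — a single call that combined ``how='last'`` with ``fill_method='ffill'`` decomposes into an explicit, readable chain::

    import numpy as np
    import pandas as pd

    s = pd.Series(np.arange(9, dtype='int64'),
                  index=pd.date_range('2010-01-01', periods=9, freq='Q'))

    # old: s.resample('M', how='last', fill_method='ffill')
    both = s.resample('M').ffill().resample('M').last().astype('int64')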
@@ -1254,37 +1750,32 @@ def test_resample_fill_missing(self):
         s = Series(np.random.randn(4), index=rng)

         stamps = s.to_timestamp()
-
-        filled = s.resample('A')
-        expected = stamps.resample('A').to_period('A')
-        assert_series_equal(filled, expected)
-
-        filled = s.resample('A', fill_method='ffill')
-        expected = stamps.resample('A', fill_method='ffill').to_period('A')
+        filled = s.resample('A').ffill()
+        expected = stamps.resample('A').ffill().to_period('A')
         assert_series_equal(filled, expected)

     def test_cant_fill_missing_dups(self):
         rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A')
         s = Series(np.random.randn(5), index=rng)
-        self.assertRaises(Exception, s.resample, 'A')
+        self.assertRaises(Exception, lambda: s.resample('A').ffill())

     def test_resample_5minute(self):
         rng = period_range('1/1/2000', '1/5/2000', freq='T')
         ts = Series(np.random.randn(len(rng)), index=rng)

-        result = ts.resample('5min')
-        expected = ts.to_timestamp().resample('5min')
+        result = ts.resample('5min').mean()
+        expected = ts.to_timestamp().resample('5min').mean()
         assert_series_equal(result, expected)

     def test_upsample_daily_business_daily(self):
         ts = _simple_pts('1/1/2000', '2/1/2000', freq='B')

-        result = ts.resample('D')
+        result = ts.resample('D').asfreq()
         expected = ts.asfreq('D').reindex(period_range('1/3/2000',
                                                        '2/1/2000'))
         assert_series_equal(result, expected)

         ts = _simple_pts('1/1/2000', '2/1/2000')
-        result = ts.resample('H', convention='s')
+        result = ts.resample('H', convention='s').asfreq()
         exp_rng = period_range('1/1/2000', '2/1/2000 23:00', freq='H')
         expected = ts.asfreq('H', how='s').reindex(exp_rng)
         assert_series_equal(result, expected)
@@ -1292,7 +1783,7 @@ def test_upsample_daily_business_daily(self):

     def test_resample_empty(self):
         ts = _simple_pts('1/1/2000', '2/1/2000')[:0]
-        result = ts.resample('A')
+        result = ts.resample('A').asfreq()
         self.assertEqual(len(result), 0)

     def test_resample_irregular_sparse(self):
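Review note (illustrative, not part of the patch): where a bare ``resample`` call used to upsample implicitly, the new API spells that out with ``.asfreq()``, yielding NaN wherever a target bin has no source observation. A sketch along the lines of ``test_upsample_daily_business_daily`` above::

    import numpy as np
    import pandas as pd

    idx = pd.period_range('2000-01-01', '2000-02-01', freq='B')
    ts = pd.Series(np.random.randn(len(idx)), index=idx)

    # old: ts.resample('D')
    result = ts.resample('D').asfreq()   # frequency conversion, no aggregation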
@@ -1301,20 +1792,20 @@ def test_resample_irregular_sparse(self):

         # subset the data.
         subset = s[:'2012-01-04 06:55']

-        result = subset.resample('10min', how=len)
-        expected = s.resample('10min', how=len).ix[result.index]
+        result = subset.resample('10min').apply(len)
+        expected = s.resample('10min').apply(len).ix[result.index]
         assert_series_equal(result, expected)

     def test_resample_weekly_all_na(self):
         rng = date_range('1/1/2000', periods=10, freq='W-WED')
         ts = Series(np.random.randn(len(rng)), index=rng)

-        result = ts.resample('W-THU')
+        result = ts.resample('W-THU').asfreq()

         self.assertTrue(result.isnull().all())

-        result = ts.resample('W-THU', fill_method='ffill')[:-1]
-        expected = ts.asfreq('W-THU', method='ffill')
+        result = ts.resample('W-THU').asfreq().ffill()[:-1]
+        expected = ts.asfreq('W-THU').ffill()

         assert_series_equal(result, expected)

     def test_resample_tz_localized(self):
@@ -1324,25 +1815,26 @@ def test_resample_tz_localized(self):
         ts_utc = ts.tz_localize('UTC')
         ts_local = ts_utc.tz_convert('America/Los_Angeles')

-        result = ts_local.resample('W')
+        result = ts_local.resample('W').mean()

         ts_local_naive = ts_local.copy()
         ts_local_naive.index = [x.replace(tzinfo=None)
                                 for x in ts_local_naive.index.to_pydatetime()]

-        exp = ts_local_naive.resample('W').tz_localize('America/Los_Angeles')
+        exp = ts_local_naive.resample(
+            'W').mean().tz_localize('America/Los_Angeles')

         assert_series_equal(result, exp)

         # it works
-        result = ts_local.resample('D')
+        result = ts_local.resample('D').mean()

         # #2245
         idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T',
                          tz='Australia/Sydney')
         s = Series([1, 2], index=idx)

-        result = s.resample('D', closed='right', label='right')
+        result = s.resample('D', closed='right', label='right').mean()
         ex_index = date_range('2001-09-21', periods=1, freq='D',
                               tz='Australia/Sydney')
         expected = Series([1.5], index=ex_index)
@@ -1350,7 +1842,7 @@ def test_resample_tz_localized(self):
         assert_series_equal(result, expected)

         # for good measure
-        result = s.resample('D', kind='period')
+        result = s.resample('D', kind='period').mean()
         ex_index = period_range('2001-09-20', periods=1, freq='D')
         expected = Series([1.5], index=ex_index)
         assert_series_equal(result, expected)
@@ -1364,11 +1856,11 @@ def test_resample_tz_localized(self):
         ts['second'] = np.cumsum(np.random.randn(len(rng)))
         expected = DataFrame(
             {
-                'first': ts.resample('A', how=np.sum)['first'],
-                'second': ts.resample('A', how=np.mean)['second']},
+                'first': ts.resample('A').sum()['first'],
+                'second': ts.resample('A').mean()['second']},
             columns=['first', 'second'])
         result = ts.resample(
-            'A', how={'first': np.sum,
+            'A').agg({'first': np.sum,
                       'second': np.mean}).reindex(columns=['first', 'second'])
         assert_frame_equal(result, expected)

@@ -1379,12 +1871,12 @@ def test_closed_left_corner(self):
                             freq='1min', periods=21))
         s[0] = np.nan

-        result = s.resample('10min', how='mean', closed='left', label='right')
-        exp = s[1:].resample('10min', how='mean', closed='left', label='right')
+        result = s.resample('10min', closed='left', label='right').mean()
+        exp = s[1:].resample('10min', closed='left', label='right').mean()
         assert_series_equal(result, exp)

-        result = s.resample('10min', how='mean', closed='left', label='left')
-        exp = s[1:].resample('10min', how='mean', closed='left', label='left')
+        result = s.resample('10min', closed='left', label='left').mean()
+        exp = s[1:].resample('10min', closed='left', label='left').mean()

         ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3)
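Review note (illustrative, not part of the patch): the tz-localized hunks above change only where the aggregation is spelled; ``closed``/``label`` behaviour across a timezone boundary is untouched. Compact sketch taken from the #2245 case::

    import pandas as pd

    idx = pd.date_range('2001-09-20 15:59', '2001-09-20 16:00',
                        freq='T', tz='Australia/Sydney')
    s = pd.Series([1, 2], index=idx)

    # old: s.resample('D', how='mean', closed='right', label='right')
    result = s.resample('D', closed='right', label='right').mean()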
@@ -1395,8 +1887,8 @@ def test_quarterly_resampling(self):
         rng = period_range('2000Q1', periods=10, freq='Q-DEC')
         ts = Series(np.arange(10), index=rng)

-        result = ts.resample('A')
-        exp = ts.to_timestamp().resample('A').to_period()
+        result = ts.resample('A').mean()
+        exp = ts.to_timestamp().resample('A').mean().to_period()
         assert_series_equal(result, exp)

     def test_resample_weekly_bug_1726(self):
@@ -1408,13 +1900,13 @@ def test_resample_weekly_bug_1726(self):
                        index=ind)

         # it works!
-        df.resample('W-MON', how='first', closed='left', label='left')
+        df.resample('W-MON', closed='left', label='left').first()

     def test_resample_bms_2752(self):
         # GH2753
         foo = pd.Series(index=pd.bdate_range('20000101', '20000201'))
-        res1 = foo.resample("BMS")
-        res2 = foo.resample("BMS").resample("B")
+        res1 = foo.resample("BMS").mean()
+        res2 = foo.resample("BMS").mean().resample("B").mean()
         self.assertEqual(res1.index[0], Timestamp('20000103'))
         self.assertEqual(res1.index[0], res2.index[0])
@@ -1427,7 +1919,7 @@ def test_resample_bms_2752(self):
     #     expected = ts.asfreq('D', how='end').reindex(exp_index)
     #     expected = expected.fillna(method='bfill')

-    #     result = ts.resample('D', convention='span')
+    #     result = ts.resample('D', convention='span').mean()

     #     assert_series_equal(result, expected)

@@ -1439,9 +1931,9 @@ def test_default_right_closed_label(self):
             idx = DatetimeIndex(start='8/15/2012', periods=100,
                                 freq=from_freq)
             df = DataFrame(np.random.randn(len(idx), 2), idx)

-            resampled = df.resample(to_freq)
+            resampled = df.resample(to_freq).mean()
             assert_frame_equal(resampled, df.resample(to_freq, closed='right',
-                                                      label='right'))
+                                                      label='right').mean())

     def test_default_left_closed_label(self):
         others = ['MS', 'AS', 'QS', 'D', 'H']
@@ -1451,16 +1943,16 @@ def test_default_left_closed_label(self):
             idx = DatetimeIndex(start='8/15/2012', periods=100,
                                 freq=from_freq)
             df = DataFrame(np.random.randn(len(idx), 2), idx)

-            resampled = df.resample(to_freq)
+            resampled = df.resample(to_freq).mean()
             assert_frame_equal(resampled, df.resample(to_freq, closed='left',
-                                                      label='left'))
+                                                      label='left').mean())

     def test_all_values_single_bin(self):
         # 2070
         index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
         s = Series(np.random.randn(len(index)), index=index)

-        result = s.resample("A", how='mean')
+        result = s.resample("A").mean()
         tm.assert_almost_equal(result[0], s.mean())

     def test_evenly_divisible_with_no_extra_bins(self):
@@ -1469,7 +1961,7 @@ def test_evenly_divisible_with_no_extra_bins(self):
         df = DataFrame(np.random.randn(9, 3),
                        index=date_range('2000-1-1', periods=9))
-        result = df.resample('5D')
+        result = df.resample('5D').mean()
         expected = pd.concat(
             [df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T
         expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')]
@@ -1488,14 +1980,14 @@ def test_evenly_divisible_with_no_extra_bins(self):
             [{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
               'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
             index=index)
-        result = df.resample('7D', how='count')
+        result = df.resample('7D').count()
         assert_frame_equal(result, expected)

         expected = DataFrame(
             [{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
               'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
             index=index)
-        result = df.resample('7D', how='sum')
+        result = df.resample('7D').sum()
         assert_frame_equal(result, expected)
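Review note (illustrative, not part of the patch): the two ``test_default_*_closed_label`` hunks above pin down that the frequency-dependent defaults survive the refactor — end-anchored rules such as 'M', 'A', 'Q' and 'W' stay right-closed/right-labeled, while 'MS', 'AS', 'QS', 'D' and 'H' stay left/left. Sketch::

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2012-08-15', periods=100, freq='D')
    df = pd.DataFrame(np.random.randn(100, 2), index=idx)

    r1 = df.resample('M').mean()
    r2 = df.resample('M', closed='right', label='right').mean()
    assert r1.equals(r2)   # 'M' defaults to right/right, exactly as before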
@@ -1521,18 +2013,19 @@ def test_apply(self):

     def test_count(self):
         self.ts[::3] = np.nan

-        # TODO: unused?
-        grouper = TimeGrouper('A', label='right', closed='right')  # noqa
-
-        result = self.ts.resample('A', how='count')
         expected = self.ts.groupby(lambda x: x.year).count()
+
+        grouper = TimeGrouper('A', label='right', closed='right')
+        result = self.ts.groupby(grouper).count()
         expected.index = result.index
+        assert_series_equal(result, expected)
+
+        result = self.ts.resample('A').count()
+        expected.index = result.index
         assert_series_equal(result, expected)

     def test_numpy_reduction(self):
-        result = self.ts.resample('A', how='prod', closed='right')
+        result = self.ts.resample('A', closed='right').prod()

         expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
         expected.index = result.index
@@ -1575,9 +2068,8 @@ def f(x):
         tm.assert_panel_equal(result, binagg)

     def test_fails_on_no_datetime_index(self):
-        index_names = ('Int64Index', 'PeriodIndex', 'Index', 'Float64Index',
-                       'MultiIndex')
-        index_funcs = (tm.makeIntIndex, tm.makePeriodIndex,
+        index_names = ('Int64Index', 'Index', 'Float64Index', 'MultiIndex')
+        index_funcs = (tm.makeIntIndex, tm.makeUnicodeIndex,
                        tm.makeFloatIndex, lambda m: tm.makeCustomIndex(m, 2))
         n = 2
@@ -1585,10 +2077,18 @@ def test_fails_on_no_datetime_index(self):
             index = func(n)
             df = DataFrame({'a': np.random.randn(n)}, index=index)
             with tm.assertRaisesRegexp(TypeError,
-                                       "axis must be a DatetimeIndex, "
+                                       "Only valid with DatetimeIndex, "
+                                       "TimedeltaIndex or PeriodIndex, "
                                        "but got an instance of %r" % name):
                 df.groupby(TimeGrouper('D'))

+        # PeriodIndex gives a specific error message
+        df = DataFrame({'a': np.random.randn(n)}, index=tm.makePeriodIndex(n))
+        with tm.assertRaisesRegexp(TypeError,
+                                   "axis must be a DatetimeIndex, but "
+                                   "got an instance of 'PeriodIndex'"):
+            df.groupby(TimeGrouper('D'))
+
     def test_aggregate_normal(self):
         # check TimeGrouper's aggregation is identical as normal groupby

diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 99cada26464cb..9d80489904eb5 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -2198,9 +2198,9 @@ def test_period_resample(self):
         s[10:30] = np.nan
         expected = Series([34.5, 79.5], index=[Period(
             '2013-01-01 00:00', 'T'), Period('2013-01-01 00:01', 'T')])
-        result = s.to_period().resample('T', kind='period')
+        result = s.to_period().resample('T', kind='period').mean()
         assert_series_equal(result, expected)
-        result2 = s.resample('T', kind='period')
+        result2 = s.resample('T', kind='period').mean()
         assert_series_equal(result2, expected)

     def test_period_resample_with_local_timezone_pytz(self):
@@ -2220,7 +2220,8 @@ def test_period_resample_with_local_timezone_pytz(self):
         series = pd.Series(1, index=index)
         series = series.tz_convert(local_timezone)
-        result = series.resample('D', kind='period')
+        result = series.resample('D', kind='period').mean()
+
         # Create the expected series
         # Index is moved back a day with the timezone conversion from UTC to
         # Pacific
@@ -2245,7 +2246,8 @@ def test_period_resample_with_local_timezone_dateutil(self):
         series = pd.Series(1, index=index)
         series = series.tz_convert(local_timezone)
-        result = series.resample('D', kind='period')
+        result = series.resample('D', kind='period').mean()
+
         # Create the expected series
         # Index is moved back a day with the timezone conversion from UTC to
         # Pacific
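Review note (illustrative, not part of the patch): in the ``test_period_resample*`` hunks above, ``kind='period'`` stays a ``resample`` keyword; only the aggregation moves onto the returned object. Sketch::

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2013-01-01 00:00', periods=120, freq='S')
    s = pd.Series(np.arange(120), index=rng)

    # old: s.resample('T', kind='period')
    result = s.resample('T', kind='period').mean()   # result has a PeriodIndex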
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index b78ba929463c9..aa5d698301da7 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -984,6 +984,7 @@ def assert_frame_equal(left, right, check_dtype=True,
                        by_blocks=False,
                        check_exact=False,
                        check_datetimelike_compat=False,
+                       check_like=False,
                        obj='DataFrame'):
     """Check that left and right DataFrame are equal.

@@ -1014,6 +1015,8 @@ def assert_frame_equal(left, right, check_dtype=True,
         Whether to compare number exactly.
     check_dateteimelike_compat : bool, default False
         Compare datetime-like which is comparable ignoring dtype.
+    check_like : bool, default False
+        If true, then reindex_like operands
     obj : str, default 'DataFrame'
         Specify object name being compared, internally used to show
         appropriate assertion message
@@ -1026,16 +1029,24 @@ def assert_frame_equal(left, right, check_dtype=True,
     if check_frame_type:
         assertIsInstance(left, type(right))

+    if check_like:
+        left, right = left.reindex_like(right), right
+
     # shape comparison (row)
     if left.shape[0] != right.shape[0]:
-        raise_assert_detail(obj, 'DataFrame shape (number of rows) are different',
+        raise_assert_detail(obj,
+                            'DataFrame shape (number of rows) are different',
                             '{0}, {1}'.format(left.shape[0], left.index),
                             '{0}, {1}'.format(right.shape[0], right.index))

     # shape comparison (columns)
     if left.shape[1] != right.shape[1]:
-        raise_assert_detail(obj, 'DataFrame shape (number of columns) are different',
-                            '{0}, {1}'.format(left.shape[1], left.columns),
-                            '{0}, {1}'.format(right.shape[1], right.columns))
+        raise_assert_detail(obj,
+                            'DataFrame shape (number of columns) '
+                            'are different',
+                            '{0}, {1}'.format(left.shape[1],
+                                              left.columns),
+                            '{0}, {1}'.format(right.shape[1],
+                                              right.columns))

     # index comparison
     assert_index_equal(left.index, right.index, exact=check_index_type,
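Review note (illustrative, not part of the patch): the new ``check_like`` flag asks ``assert_frame_equal`` to ``reindex_like`` the left operand before comparing, so frames that differ only in row/column order compare equal. Usage sketch::

    import pandas as pd
    import pandas.util.testing as tm

    a = pd.DataFrame({'x': [1, 2], 'y': [3, 4]}, index=[0, 1])
    b = a.loc[[1, 0], ['y', 'x']]   # same data, shuffled row and column order

    tm.assert_frame_equal(a, b, check_like=True)   # passes
    # tm.assert_frame_equal(a, b)                  # raises: index order differs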