From ca7b6f2fba370bbdba1b43f1f3b0879ccbfe4934 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Wed, 26 Apr 2017 16:09:20 +0200 Subject: [PATCH 01/20] CLN: move PeriodIndex binning code to TimeGrouper Simplifies PeriodIndexResampler and makes it more similar to Resampler, DatetimeIndexResampler and TimedeltaIndexResampler. --- pandas/core/resample.py | 70 ++++++++++++++++------------------- pandas/tests/test_resample.py | 7 ---- 2 files changed, 32 insertions(+), 45 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 01c7e875b8ecc..98fcca4325f15 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -14,7 +14,7 @@ from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.offsets import DateOffset, Tick, Day, _delta_to_nanoseconds -from pandas.core.indexes.period import PeriodIndex, period_range +from pandas.core.indexes.period import PeriodIndex import pandas.core.common as com import pandas.core.algorithms as algos from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -834,6 +834,11 @@ class PeriodIndexResampler(DatetimeIndexResampler): def _resampler_for_grouping(self): return PeriodIndexResamplerGroupby + def _get_binner_for_time(self): + if self.kind == 'timestamp': + return super(PeriodIndexResampler, self)._get_binner_for_time() + return self.groupby._get_period_bins(self.ax) + def _convert_obj(self, obj): obj = super(PeriodIndexResampler, self)._convert_obj(obj) @@ -858,29 +863,6 @@ def _convert_obj(self, obj): return obj - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) - if result is None: - result = self._downsample(arg, *args, **kwargs) - - result = self._apply_loffset(result) - return result - - agg = aggregate - - def _get_new_index(self): - """ return our new index """ - ax = self.ax - - if len(ax) == 0: - values = [] - else: - start = 
ax[0].asfreq(self.freq, how=self.convention) - end = ax[-1].asfreq(self.freq, how='end') - values = period_range(start, end, freq=self.freq).asi8 - - return ax._shallow_copy(values, freq=self.freq) - def _downsample(self, how, **kwargs): """ Downsample the cython defined function @@ -898,21 +880,9 @@ def _downsample(self, how, **kwargs): how = self._is_cython_func(how) or how ax = self.ax - new_index = self._get_new_index() - - # Start vs. end of period - memb = ax.asfreq(self.freq, how=self.convention) - if is_subperiod(ax.freq, self.freq): # Downsampling - if len(new_index) == 0: - bins = [] - else: - i8 = memb.asi8 - rng = np.arange(i8[0], i8[-1] + 1) - bins = memb.searchsorted(rng, side='right') - grouper = BinGrouper(bins, new_index) - return self._groupby_and_aggregate(how, grouper=grouper) + return self._groupby_and_aggregate(how, grouper=self.grouper) elif is_superperiod(ax.freq, self.freq): return self.asfreq() elif ax.freq == self.freq: @@ -946,9 +916,10 @@ def _upsample(self, method, limit=None, fill_value=None): return super(PeriodIndexResampler, self)._upsample( method, limit=limit, fill_value=fill_value) + self._set_binner() ax = self.ax obj = self.obj - new_index = self._get_new_index() + new_index = self.binner # Start vs. 
end of period memb = ax.asfreq(self.freq, how=self.convention) @@ -1293,6 +1264,29 @@ def _get_time_period_bins(self, ax): return binner, bins, labels + def _get_period_bins(self, ax): + if not isinstance(ax, PeriodIndex): + raise TypeError('axis must be a PeriodIndex, but got ' + 'an instance of %r' % type(ax).__name__) + + if not len(ax): + binner = labels = PeriodIndex( + data=[], freq=self.freq, name=ax.name) + return binner, [], labels + + start = ax[0].asfreq(self.freq, how=self.convention) + end = ax[-1].asfreq(self.freq, how='end') + + labels = binner = PeriodIndex(start=start, end=end, + freq=self.freq, name=ax.name) + + memb = ax.asfreq(self.freq, how=self.convention) + i8 = memb.asi8 + rng = np.arange(i8[0], i8[-1] + 1) + bins = memb.searchsorted(rng, side='right') + + return binner, bins, labels + def _take_new_index(obj, indexer, new_index, axis=0): from pandas.core.api import Series, DataFrame diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 7449beb8f97df..c182157c8afa7 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3167,13 +3167,6 @@ def test_fails_on_no_datetime_index(self): "instance of %r" % name): df.groupby(TimeGrouper('D')) - # PeriodIndex gives a specific error message - df = DataFrame({'a': np.random.randn(n)}, index=tm.makePeriodIndex(n)) - with tm.assert_raises_regex(TypeError, - "axis must be a DatetimeIndex, but " - "got an instance of 'PeriodIndex'"): - df.groupby(TimeGrouper('D')) - def test_aaa_group_order(self): # GH 12840 # check TimeGrouper perform stable sorts From c27f43048ba0c7ae7d1f0c4947d5ac253f28f628 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Wed, 26 Apr 2017 16:37:31 +0200 Subject: [PATCH 02/20] TST/CLN: raise error when resampling with on= or level= selection Added tests to cover all code paths, moved the check to _convert_obj() and removed then-redundant check from _upsample() method. 
--- pandas/core/resample.py | 22 +++++++++------------- pandas/tests/test_resample.py | 17 +++++++++++------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 98fcca4325f15..0fdbaefb8ea24 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -842,6 +842,13 @@ def _get_binner_for_time(self): def _convert_obj(self, obj): obj = super(PeriodIndexResampler, self)._convert_obj(obj) + if self._from_selection: + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) to explicitly set index") + raise NotImplementedError(msg) + offset = to_offset(self.freq) if offset.n > 1: if self.kind == 'period': # pragma: no cover @@ -852,14 +859,7 @@ def _convert_obj(self, obj): # convert to timestamp if not (self.kind is None or self.kind == 'period'): - if self._from_selection: - # see GH 14008, GH 12871 - msg = ("Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) 
to explicitly set index") - raise NotImplementedError(msg) - else: - obj = obj.to_timestamp(how=self.convention) + obj = obj.to_timestamp(how=self.convention) return obj @@ -906,11 +906,7 @@ def _upsample(self, method, limit=None, fill_value=None): .fillna """ - if self._from_selection: - raise ValueError("Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like") + # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample( diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c182157c8afa7..5339badf8898e 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2294,12 +2294,17 @@ def test_selection(self): index=pd.MultiIndex.from_arrays([ np.arange(len(index), dtype=np.int64), index], names=['v', 'd'])) - - with pytest.raises(NotImplementedError): - df.resample('2D', on='date') - - with pytest.raises(NotImplementedError): - df.resample('2D', level='d') + for freq in ['H', '12H', '2D', 'W']: + # check up- and downsampling with base freqs and freq multiples + with pytest.raises(NotImplementedError): + df.resample(freq, on='date') + with pytest.raises(NotImplementedError): + df.resample(freq, level='d') + for kind_param in ['timestamp', 'period']: + with pytest.raises(NotImplementedError): + df.resample(freq, on='date', kind=kind_param) + with pytest.raises(NotImplementedError): + df.resample(freq, level='d', kind=kind_param) def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill') From 390e16e805e5bdeee7d01f3fb181b183fd30127b Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Wed, 26 Apr 2017 19:55:30 +0200 Subject: [PATCH 03/20] BUG: resampling PeriodIndex now returns PeriodIndex (GH 12884, 15944) Exceptions: - force conversion to DatetimeIndex by kind='timestamp' param - if loffset is given, convert to timestamps in 
any case --- pandas/core/resample.py | 19 ++--- pandas/tests/test_resample.py | 131 +++++++++++++++++++++++++--------- 2 files changed, 108 insertions(+), 42 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0fdbaefb8ea24..8840c16be41f0 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -849,16 +849,15 @@ def _convert_obj(self, obj): " use .set_index(...) to explicitly set index") raise NotImplementedError(msg) - offset = to_offset(self.freq) - if offset.n > 1: - if self.kind == 'period': # pragma: no cover - print('Warning: multiple of frequency -> timestamps') - - # Cannot have multiple of periods, convert to timestamp + if self.loffset is not None: + if self.kind == 'period': + print('Warning: loffset -> convert PeriodIndex to timestamps') + # Cannot apply loffset/timedelta to PeriodIndex -> convert to + # timestamps self.kind = 'timestamp' # convert to timestamp - if not (self.kind is None or self.kind == 'period'): + if self.kind == 'timestamp': obj = obj.to_timestamp(how=self.convention) return obj @@ -1278,8 +1277,10 @@ def _get_period_bins(self, ax): memb = ax.asfreq(self.freq, how=self.convention) i8 = memb.asi8 - rng = np.arange(i8[0], i8[-1] + 1) - bins = memb.searchsorted(rng, side='right') + freq_mult = self.freq.n + rng = np.arange(i8[0], i8[-1] + 1, freq_mult) + rng += freq_mult + bins = memb.searchsorted(rng, side='left') return binner, bins, labels diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 5339badf8898e..ab624fae1bc1c 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2221,49 +2221,75 @@ def create_series(self): return Series(np.arange(len(i)), index=i, name='pi') def test_asfreq_downsample(self): + # GH 12884, 15944 - # series s = self.create_series() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index = expected.index.to_timestamp() - expected.index.freq = to_offset('2D') + start = 
s.index[0].to_timestamp(how='start') + end = (s.index[-1] + 1).to_timestamp(how='start') - # this is a bug, this *should* return a PeriodIndex - # directly - # GH 12884 + new_index = date_range(start=start, end=end, freq='2D', closed='left') + # series + expected = s.to_timestamp().reindex(new_index).to_period('2D') result = s.resample('2D').asfreq() assert_series_equal(result, expected) + result_kind_period = s.resample('2D', kind='period').asfreq() + assert_series_equal(result_kind_period, expected) # frame frame = s.to_frame('value') - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index = expected.index.to_timestamp() - expected.index.freq = to_offset('2D') + expected = frame.to_timestamp().reindex(new_index).to_period('2D') result = frame.resample('2D').asfreq() assert_frame_equal(result, expected) + result_kind_period = frame.resample('2D', kind='period').asfreq() + assert_frame_equal(result_kind_period, expected) + + def test_asfreq_downsample_kind_timestamp(self): + # series + s = self.create_series() + expected = s.to_timestamp().resample('2D').asfreq() + result = s.resample('2D', kind='timestamp').asfreq() + assert_series_equal(result, expected) + + # frame + frame = s.to_frame('value') + expected = frame.to_timestamp().resample('2D').asfreq() + result = frame.resample('2D', kind='timestamp').asfreq() + assert_frame_equal(result, expected) def test_asfreq_upsample(self): + # GH 12884, 15944 - # this is a bug, this *should* return a PeriodIndex - # directly - # GH 12884 s = self.create_series() - new_index = date_range(s.index[0].to_timestamp(how='start'), - (s.index[-1] + 1).to_timestamp(how='start'), - freq='1H', - closed='left') - expected = s.to_timestamp().reindex(new_index).to_period() - result = s.resample('1H').asfreq() + start = s.index[0].to_timestamp(how='start') + end = (s.index[-1] + 1).to_timestamp(how='start') + for freq in ['1H', '2H']: + # check base frequency and frequency multiple + new_index 
= date_range(start=start, end=end, freq=freq, + closed='left') + # series + expected = s.to_timestamp().reindex(new_index).to_period(freq) + result = s.resample(freq).asfreq() + assert_series_equal(result, expected) + result_kind_period = s.resample(freq, kind='period').asfreq() + assert_series_equal(result_kind_period, expected) + + # frame + frame = s.to_frame('value') + expected = frame.to_timestamp().reindex(new_index).to_period(freq) + result = frame.resample(freq).asfreq() + assert_frame_equal(result, expected) + result_kind_period = frame.resample(freq, kind='period').asfreq() + assert_frame_equal(result_kind_period, expected) + + def test_asfreq_upsample_kind_timestamp(self): + s = self.create_series() + expected = s.to_timestamp().resample('1H').asfreq() + result = s.resample('1H', kind='timestamp').asfreq() assert_series_equal(result, expected) frame = s.to_frame('value') - new_index = date_range(frame.index[0].to_timestamp(how='start'), - (frame.index[-1] + 1).to_timestamp(how='start'), - freq='1H', - closed='left') - expected = frame.to_timestamp().reindex(new_index).to_period() - result = frame.resample('1H').asfreq() + expected = frame.to_timestamp().resample('1H').asfreq() + result = frame.resample('1H', kind='timestamp').asfreq() assert_frame_equal(result, expected) def test_asfreq_fill_value(self): @@ -2375,12 +2401,11 @@ def test_basic_upsample(self): ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') result = ts.resample('a-dec').mean() - resampled = result.resample('D', convention='end').ffill() - - expected = result.to_timestamp('D', how='end') - expected = expected.asfreq('D', 'ffill').to_period() - - assert_series_equal(resampled, expected) + for freq in ['D', '2D']: + resampled = result.resample(freq, convention='end').ffill() + expected = result.to_timestamp(freq, how='end') + expected = expected.asfreq(freq, 'ffill').to_period(freq) + assert_series_equal(resampled, expected) def test_upsample_with_limit(self): rng = 
period_range('1/1/2000', periods=5, freq='A') @@ -2451,10 +2476,13 @@ def test_resample_count(self): series = pd.Series(1, index=pd.period_range(start='2000', periods=100)) result = series.resample('M').count() - expected_index = pd.period_range(start='2000', freq='M', periods=4) expected = pd.Series([31, 29, 31, 9], index=expected_index) + assert_series_equal(result, expected) + result = series.resample('2M').count() + expected_index = pd.period_range(start='2000', freq='2M', periods=2) + expected = pd.Series([31 + 29, 31 + 9], index=expected_index) assert_series_equal(result, expected) def test_resample_same_freq(self): @@ -2596,7 +2624,17 @@ def test_resample_5minute(self): rng = period_range('1/1/2000', '1/5/2000', freq='T') ts = Series(np.random.randn(len(rng)), index=rng) + expected = ts.to_timestamp().resample('5min').mean().to_period('5min') result = ts.resample('5min').mean() + assert_series_equal(result, expected) + result_kind_period = ts.resample('5min', kind='period').mean() + assert_series_equal(result_kind_period, expected) + + def test_resample_5minute_kind_timestamp(self): + rng = period_range('1/1/2000', '1/5/2000', freq='T') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('5min', kind='timestamp').mean() expected = ts.to_timestamp().resample('5min').mean() assert_series_equal(result, expected) @@ -2824,6 +2862,33 @@ def test_apply_to_empty_series(self): for freq in ['M', 'D', 'H']: with pytest.raises(TypeError): series.resample(freq).apply(lambda x: 1) + def test_loffset_returns_datetimeindex(self): + # make sure passing loffset returns DatetimeIndex in all cases + # basic method taken from Base.test_resample_loffset_arg_type() + df = self.create_series().to_frame('value') + expected_means = [df.values[i:i + 2].mean() + for i in range(0, len(df.values), 2)] + expected_index = self.create_index(df.index[0], periods=len(df.index) / + 2, freq='2D') + + # loffset coreces PeriodIndex to DateTimeIndex + expected_index = 
expected_index.to_timestamp() + expected_index += timedelta(hours=2) + expected = DataFrame({'value': expected_means}, index=expected_index) + + for arg in ['mean', {'value': 'mean'}, ['mean']]: + for kind_param in [None, 'period', 'timestamp']: + result_agg = (df.resample('2D', loffset='2H', kind=kind_param) + .agg(arg)) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result_how = df.resample('2D', how=arg, loffset='2H', + kind=kind_param) + if isinstance(arg, list): + expected.columns = (pd.MultiIndex + .from_tuples([('value', 'mean')])) + assert_frame_equal(result_agg, expected) + assert_frame_equal(result_how, expected) class TestTimedeltaIndex(Base): From 23566c2de40eb0215d2eb631a8e20b93de197424 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Wed, 26 Apr 2017 20:31:10 +0200 Subject: [PATCH 04/20] BUG: OHLC-upsampling of PeriodIndex now returns DataFrame (GH 13083) --- pandas/core/resample.py | 11 ++++++++++- pandas/tests/test_resample.py | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8840c16be41f0..00e2a81693499 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -883,6 +883,12 @@ def _downsample(self, how, **kwargs): # Downsampling return self._groupby_and_aggregate(how, grouper=self.grouper) elif is_superperiod(ax.freq, self.freq): + if how == 'ohlc': + # upsampling to subperiods is handled as an asfreq, which works + # for pure aggregating/reducing methods + # OHLC reduces along the time dimension, but creates multiple + # values for each period -> handle by _groupby_and_aggregate() + return self._groupby_and_aggregate(how, grouper=self.grouper) return self.asfreq() elif ax.freq == self.freq: return self.asfreq() @@ -1278,7 +1284,10 @@ def _get_period_bins(self, ax): memb = ax.asfreq(self.freq, how=self.convention) i8 = memb.asi8 freq_mult = self.freq.n - rng = np.arange(i8[0], i8[-1] + 1, freq_mult) + # 
when upsampling to subperiods, we need to generate enough bins + expected_bins_count = len(binner) * freq_mult + i8_extend = expected_bins_count - (i8[-1] - i8[0]) + rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult) rng += freq_mult bins = memb.searchsorted(rng, side='left') diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index ab624fae1bc1c..23c1f46ea46df 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2890,6 +2890,29 @@ def test_loffset_returns_datetimeindex(self): assert_frame_equal(result_agg, expected) assert_frame_equal(result_how, expected) + def test_upsampling_ohlc(self): + # GH 13083 + pi = PeriodIndex(start='2000', freq='D', periods=10) + s = Series(range(len(pi)), index=pi) + expected = s.to_timestamp().resample('H').ohlc().to_period() + # timestamp-based resampling doesn't include all sub-periods + # of the last original period, so extend accordingly: + pi_ext = PeriodIndex(start='2000', freq='H', periods=24 * len(pi)) + expected = expected.reindex(pi_ext) + result = s.resample('H').ohlc() + assert_frame_equal(result, expected) + + def test_upsampling_ohlc_freq_multiples(self): + pi = PeriodIndex(start='2000', freq='D', periods=10) + s = pd.Series(range(len(pi)), index=pi) + expected = s.to_timestamp().resample('12H').ohlc().to_period('12H') + # timestamp-based resampling doesn't include all sub-periods + # of the last original period, so extend accordingly: + pi_ext = PeriodIndex(start='2000', freq='12H', periods=2 * len(pi)) + expected = expected.reindex(pi_ext) + result = s.resample('12H', kind='period').ohlc() + assert_frame_equal(result, expected) + class TestTimedeltaIndex(Base): _index_factory = lambda x: timedelta_range From a82879d74f407bb605550e77244947b7588cde5b Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Wed, 26 Apr 2017 20:44:49 +0200 Subject: [PATCH 05/20] BUG: enable resampling with NaT in PeriodIndex (GH 13224) --- pandas/core/resample.py | 32 
++++++++++++++++++++++++---- pandas/tests/test_resample.py | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 00e2a81693499..f85c12f626fe2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1270,18 +1270,34 @@ def _get_period_bins(self, ax): raise TypeError('axis must be a PeriodIndex, but got ' 'an instance of %r' % type(ax).__name__) - if not len(ax): + memb = ax.asfreq(self.freq, how=self.convention) + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + nat_count = 0 + if memb.hasnans: + import warnings + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', 'numpy equal will not check ' + 'object identity') + nat_mask = memb.base == tslib.NaT + # raises "FutureWarning: numpy equal will not check object + # identity in the future. The comparison did not return the + # same result as suggested by the identity (`is`)) and will + # change." 
+ nat_count = np.sum(nat_mask) + memb = memb[~nat_mask] + + # if index contains no valid (non-NaT) values, return empty index + if not len(memb): binner = labels = PeriodIndex( data=[], freq=self.freq, name=ax.name) return binner, [], labels - start = ax[0].asfreq(self.freq, how=self.convention) - end = ax[-1].asfreq(self.freq, how='end') + start = ax.min().asfreq(self.freq, how=self.convention) + end = ax.max().asfreq(self.freq, how='end') labels = binner = PeriodIndex(start=start, end=end, freq=self.freq, name=ax.name) - memb = ax.asfreq(self.freq, how=self.convention) i8 = memb.asi8 freq_mult = self.freq.n # when upsampling to subperiods, we need to generate enough bins @@ -1291,6 +1307,14 @@ def _get_period_bins(self, ax): rng += freq_mult bins = memb.searchsorted(rng, side='left') + if nat_count > 0: + # NaT handling as in pandas._lib.lib.generate_bins_dt64() + # shift bins by the number of NaT + bins += nat_count + bins = np.insert(bins, 0, nat_count) + binner = binner.insert(0, tslib.NaT) + labels = labels.insert(0, tslib.NaT) + return binner, bins, labels diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 23c1f46ea46df..0ee4f3b64159e 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2913,6 +2913,45 @@ def test_upsampling_ohlc_freq_multiples(self): result = s.resample('12H', kind='period').ohlc() assert_frame_equal(result, expected) + def test_resample_with_nat(self): + # GH 13224 + index = PeriodIndex([pd.NaT, '1970-01-01 00:00:00', pd.NaT, + '1970-01-01 00:00:01', '1970-01-01 00:00:02'], + freq='S') + frame = DataFrame([2, 3, 5, 7, 11], index=index) + + index_1s = PeriodIndex(['1970-01-01 00:00:00', '1970-01-01 00:00:01', + '1970-01-01 00:00:02'], freq='S') + frame_1s = DataFrame([3, 7, 11], index=index_1s) + result_1s = frame.resample('1s').mean() + assert_frame_equal(result_1s, frame_1s) + + index_2s = PeriodIndex(['1970-01-01 00:00:00', + '1970-01-01 00:00:02'], freq='2S') + frame_2s = 
DataFrame([5, 11], index=index_2s) + result_2s = frame.resample('2s').mean() + assert_frame_equal(result_2s, frame_2s) + + index_3s = PeriodIndex(['1970-01-01 00:00:00'], freq='3S') + frame_3s = DataFrame([7], index=index_3s) + result_3s = frame.resample('3s').mean() + assert_frame_equal(result_3s, frame_3s) + + pi = PeriodIndex(['1970-01-01 00:00:00', pd.NaT, + '1970-01-01 00:00:02'], freq='S') + frame = DataFrame([2, 3, 5], index=pi) + expected_index = period_range(pi[0], periods=len(pi), freq=pi.freq) + expected = DataFrame([2, np.NaN, 5], index=expected_index) + result = frame.resample('1s').mean() + assert_frame_equal(result, expected) + + pi = PeriodIndex([pd.NaT] * 3, freq='S') + frame = DataFrame([2, 3, 5], index=pi) + expected_index = PeriodIndex(data=[], freq=pi.freq) + expected = DataFrame([], index=expected_index) + result = frame.resample('1s').mean() + assert_frame_equal(result, expected) + class TestTimedeltaIndex(Base): _index_factory = lambda x: timedelta_range From 4b1c740e67ec1508eae15ebfcef30a49663feb03 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sun, 30 Apr 2017 20:41:11 +0200 Subject: [PATCH 06/20] CLN: remove warning on falling back to tstamp resampling with loffset --- pandas/core/resample.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index f85c12f626fe2..d2884924abbeb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -850,10 +850,8 @@ def _convert_obj(self, obj): raise NotImplementedError(msg) if self.loffset is not None: - if self.kind == 'period': - print('Warning: loffset -> convert PeriodIndex to timestamps') - # Cannot apply loffset/timedelta to PeriodIndex -> convert to - # timestamps + # Cannot apply loffset/timedelta to PeriodIndex -> convert to + # timestamps self.kind = 'timestamp' # convert to timestamp From 73c09902461f981d60635f71f12ff5ddcf7ead51 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Mon, 1 May 2017 
20:36:55 +0200 Subject: [PATCH 07/20] CLN: use memb._isnan for NaT masking --- pandas/core/resample.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d2884924abbeb..d501819259e69 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1272,17 +1272,8 @@ def _get_period_bins(self, ax): # NaT handling as in pandas._lib.lib.generate_bins_dt64() nat_count = 0 if memb.hasnans: - import warnings - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'numpy equal will not check ' - 'object identity') - nat_mask = memb.base == tslib.NaT - # raises "FutureWarning: numpy equal will not check object - # identity in the future. The comparison did not return the - # same result as suggested by the identity (`is`)) and will - # change." - nat_count = np.sum(nat_mask) - memb = memb[~nat_mask] + nat_count = np.sum(memb._isnan) + memb = memb[~memb._isnan] # if index contains no valid (non-NaT) values, return empty index if not len(memb): From fa6c1d3586f7cdb5379f1d29d07603944b8391b7 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Mon, 1 May 2017 20:40:21 +0200 Subject: [PATCH 08/20] DOC: added issue reference for OHLC resampling --- pandas/core/resample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index d501819259e69..715c605e6efeb 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -882,6 +882,7 @@ def _downsample(self, how, **kwargs): return self._groupby_and_aggregate(how, grouper=self.grouper) elif is_superperiod(ax.freq, self.freq): if how == 'ohlc': + # GH #13083 # upsampling to subperiods is handled as an asfreq, which works # for pure aggregating/reducing methods # OHLC reduces along the time dimension, but creates multiple From 7ea04e9a6d296d2a33da99b1fdc6aa387df5ddde Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Mon, 1 May 2017 20:44:56 +0200 Subject: [PATCH 09/20] 
STYLE: added blank lines --- pandas/core/resample.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 715c605e6efeb..083fbcaaabe46 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1270,6 +1270,7 @@ def _get_period_bins(self, ax): 'an instance of %r' % type(ax).__name__) memb = ax.asfreq(self.freq, how=self.convention) + # NaT handling as in pandas._lib.lib.generate_bins_dt64() nat_count = 0 if memb.hasnans: @@ -1290,6 +1291,7 @@ def _get_period_bins(self, ax): i8 = memb.asi8 freq_mult = self.freq.n + # when upsampling to subperiods, we need to generate enough bins expected_bins_count = len(binner) * freq_mult i8_extend = expected_bins_count - (i8[-1] - i8[0]) From 82a82756d9f84307927dcaf66c87f499e9ff9b81 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 6 May 2017 08:53:12 +0200 Subject: [PATCH 10/20] TST: convert to parametrized tests / pytest idiom --- pandas/tests/test_resample.py | 298 +++++++++++++++------------------- 1 file changed, 127 insertions(+), 171 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 0ee4f3b64159e..2d2cf12455ba1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -53,6 +53,13 @@ def _simple_pts(start, end, freq='D'): return Series(np.random.randn(len(rng)), index=rng) +def assert_series_or_frame_equal(result, expected): + if isinstance(result, Series): + return assert_series_equal(result, expected) + elif isinstance(result, DataFrame): + return assert_frame_equal(result, expected) + + class TestResampleAPI(object): def setup_method(self, method): @@ -698,6 +705,32 @@ def create_index(self, *args, **kwargs): factory = self._index_factory() return factory(*args, **kwargs) + _index_fixture_start = datetime(2005, 1, 1) + _index_fixture_end = datetime(2005, 1, 10) + _index_fixture_freq = 'D' + + @pytest.fixture(scope='class') + def index(self): + return 
self.create_index(self._index_fixture_start, + self._index_fixture_end, + freq=self._index_fixture_freq) + + @pytest.fixture(scope='class') + def series(self, index): + return Series(np.arange(len(index)), index=index, + name=self._series_fixture_name) + + @pytest.fixture(scope='class') + def frame(self, index): + return DataFrame({'value': np.arange(len(index))}, index=index) + + @pytest.fixture(params=[Series, DataFrame], scope='class') + def series_and_frame(self, request, index): + if request.param == Series: + return self.series(index) + if request.param == DataFrame: + return self.frame(index) + def test_asfreq_downsample(self): s = self.create_series() @@ -865,6 +898,7 @@ def test_apply_to_empty_series(self): class TestDatetimeIndex(Base): _index_factory = lambda x: date_range + _series_fixture_name = 'dti' def setup_method(self, method): dti = DatetimeIndex(start=datetime(2005, 1, 1), @@ -2213,84 +2247,47 @@ def test_resample_datetime_values(self): class TestPeriodIndex(Base): _index_factory = lambda x: period_range + _series_fixture_name = 'pi' def create_series(self): + # TODO: replace calls to .create_series() by injecting the series + # fixture i = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq='D') return Series(np.arange(len(i)), index=i, name='pi') - def test_asfreq_downsample(self): + @pytest.mark.parametrize('freq', ['2D']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_asfreq_downsample(self, series_and_frame, freq, kind): # GH 12884, 15944 + obj = series_and_frame + start = obj.index[0].to_timestamp(how='start') + end = (obj.index[-1] + 1).to_timestamp(how='start') + if kind == 'timestamp': + expected = obj.to_timestamp().resample(freq).asfreq() + else: + new_index = date_range(start=start, end=end, freq=freq, + closed='left') + expected = obj.to_timestamp().reindex(new_index).to_period(freq) + result = obj.resample(freq, kind=kind).asfreq() + assert_series_or_frame_equal(result, expected) - s = 
self.create_series() - start = s.index[0].to_timestamp(how='start') - end = (s.index[-1] + 1).to_timestamp(how='start') - - new_index = date_range(start=start, end=end, freq='2D', closed='left') - # series - expected = s.to_timestamp().reindex(new_index).to_period('2D') - result = s.resample('2D').asfreq() - assert_series_equal(result, expected) - result_kind_period = s.resample('2D', kind='period').asfreq() - assert_series_equal(result_kind_period, expected) - - # frame - frame = s.to_frame('value') - expected = frame.to_timestamp().reindex(new_index).to_period('2D') - result = frame.resample('2D').asfreq() - assert_frame_equal(result, expected) - result_kind_period = frame.resample('2D', kind='period').asfreq() - assert_frame_equal(result_kind_period, expected) - - def test_asfreq_downsample_kind_timestamp(self): - # series - s = self.create_series() - expected = s.to_timestamp().resample('2D').asfreq() - result = s.resample('2D', kind='timestamp').asfreq() - assert_series_equal(result, expected) - - # frame - frame = s.to_frame('value') - expected = frame.to_timestamp().resample('2D').asfreq() - result = frame.resample('2D', kind='timestamp').asfreq() - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): + @pytest.mark.parametrize('freq', ['1H', '2H']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_asfreq_upsample(self, series_and_frame, freq, kind): # GH 12884, 15944 - - s = self.create_series() - start = s.index[0].to_timestamp(how='start') - end = (s.index[-1] + 1).to_timestamp(how='start') - for freq in ['1H', '2H']: - # check base frequency and frequency multiple + obj = series_and_frame + start = obj.index[0].to_timestamp(how='start') + end = (obj.index[-1] + 1).to_timestamp(how='start') + if kind == 'timestamp': + expected = obj.to_timestamp().resample(freq).asfreq() + else: new_index = date_range(start=start, end=end, freq=freq, closed='left') - # series - expected = 
s.to_timestamp().reindex(new_index).to_period(freq) - result = s.resample(freq).asfreq() - assert_series_equal(result, expected) - result_kind_period = s.resample(freq, kind='period').asfreq() - assert_series_equal(result_kind_period, expected) - - # frame - frame = s.to_frame('value') - expected = frame.to_timestamp().reindex(new_index).to_period(freq) - result = frame.resample(freq).asfreq() - assert_frame_equal(result, expected) - result_kind_period = frame.resample(freq, kind='period').asfreq() - assert_frame_equal(result_kind_period, expected) - - def test_asfreq_upsample_kind_timestamp(self): - s = self.create_series() - expected = s.to_timestamp().resample('1H').asfreq() - result = s.resample('1H', kind='timestamp').asfreq() - assert_series_equal(result, expected) - - frame = s.to_frame('value') - expected = frame.to_timestamp().resample('1H').asfreq() - result = frame.resample('1H', kind='timestamp').asfreq() - assert_frame_equal(result, expected) + expected = obj.to_timestamp().reindex(new_index).to_period(freq) + result = obj.resample(freq, kind=kind).asfreq() + assert_series_or_frame_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 @@ -2311,8 +2308,9 @@ def test_asfreq_fill_value(self): result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0) assert_frame_equal(result, expected) - def test_selection(self): - index = self.create_series().index + @pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W']) + @pytest.mark.parametrize('kind', [None, 'period', 'timestamp']) + def test_selection(self, index, freq, kind): # This is a bug, these should be implemented # GH 14008 df = pd.DataFrame({'date': index, @@ -2320,17 +2318,10 @@ def test_selection(self): index=pd.MultiIndex.from_arrays([ np.arange(len(index), dtype=np.int64), index], names=['v', 'd'])) - for freq in ['H', '12H', '2D', 'W']: - # check up- and downsampling with base freqs and freq multiples - with 
pytest.raises(NotImplementedError): - df.resample(freq, on='date') - with pytest.raises(NotImplementedError): - df.resample(freq, level='d') - for kind_param in ['timestamp', 'period']: - with pytest.raises(NotImplementedError): - df.resample(freq, on='date', kind=kind_param) - with pytest.raises(NotImplementedError): - df.resample(freq, level='d', kind=kind_param) + with pytest.raises(NotImplementedError): + df.resample(freq, on='date', kind=kind) + with pytest.raises(NotImplementedError): + df.resample(freq, level='d', kind=kind) def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill') @@ -2397,15 +2388,15 @@ def test_not_subperiod(self): pytest.raises(ValueError, lambda: ts.resample('M').mean()) pytest.raises(ValueError, lambda: ts.resample('w-thu').mean()) - def test_basic_upsample(self): + @pytest.mark.parametrize('freq', ['D', '2D']) + def test_basic_upsample(self, freq): ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') result = ts.resample('a-dec').mean() - for freq in ['D', '2D']: - resampled = result.resample(freq, convention='end').ffill() - expected = result.to_timestamp(freq, how='end') - expected = expected.asfreq(freq, 'ffill').to_period(freq) - assert_series_equal(resampled, expected) + resampled = result.resample(freq, convention='end').ffill() + expected = result.to_timestamp(freq, how='end') + expected = expected.asfreq(freq, 'ffill').to_period(freq) + assert_series_equal(resampled, expected) def test_upsample_with_limit(self): rng = period_range('1/1/2000', periods=5, freq='A') @@ -2470,19 +2461,15 @@ def test_resample_basic(self): result2 = s.resample('T', kind='period').mean() assert_series_equal(result2, expected) - def test_resample_count(self): - + @pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]), + ('2M', [31 + 29, 31 + 9])]) + def test_resample_count(self, freq, expected_vals): # GH12774 - series = pd.Series(1, index=pd.period_range(start='2000', - periods=100)) - result = 
series.resample('M').count() - expected_index = pd.period_range(start='2000', freq='M', periods=4) - expected = pd.Series([31, 29, 31, 9], index=expected_index) - assert_series_equal(result, expected) - - result = series.resample('2M').count() - expected_index = pd.period_range(start='2000', freq='2M', periods=2) - expected = pd.Series([31 + 29, 31 + 9], index=expected_index) + series = pd.Series(1, index=pd.period_range(start='2000', periods=100)) + result = series.resample(freq).count() + expected_index = pd.period_range(start='2000', freq=freq, + periods=len(expected_vals)) + expected = pd.Series(expected_vals, index=expected_index) assert_series_equal(result, expected) def test_resample_same_freq(self): @@ -2620,22 +2607,15 @@ def test_cant_fill_missing_dups(self): s = Series(np.random.randn(5), index=rng) pytest.raises(Exception, lambda: s.resample('A').ffill()) - def test_resample_5minute(self): - rng = period_range('1/1/2000', '1/5/2000', freq='T') - ts = Series(np.random.randn(len(rng)), index=rng) - - expected = ts.to_timestamp().resample('5min').mean().to_period('5min') - result = ts.resample('5min').mean() - assert_series_equal(result, expected) - result_kind_period = ts.resample('5min', kind='period').mean() - assert_series_equal(result_kind_period, expected) - - def test_resample_5minute_kind_timestamp(self): + @pytest.mark.parametrize('freq', ['5min']) + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + def test_resample_5minute(self, freq, kind): rng = period_range('1/1/2000', '1/5/2000', freq='T') ts = Series(np.random.randn(len(rng)), index=rng) - - result = ts.resample('5min', kind='timestamp').mean() - expected = ts.to_timestamp().resample('5min').mean() + expected = ts.to_timestamp().resample(freq).mean() + if kind != 'timestamp': + expected = expected.to_period(freq) + result = ts.resample(freq, kind=kind).mean() assert_series_equal(result, expected) def test_upsample_daily_business_daily(self): @@ -2862,89 +2842,65 @@ def 
test_apply_to_empty_series(self): for freq in ['M', 'D', 'H']: with pytest.raises(TypeError): series.resample(freq).apply(lambda x: 1) - def test_loffset_returns_datetimeindex(self): + + @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) + @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']]) + def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): # make sure passing loffset returns DatetimeIndex in all cases # basic method taken from Base.test_resample_loffset_arg_type() - df = self.create_series().to_frame('value') + df = frame expected_means = [df.values[i:i + 2].mean() for i in range(0, len(df.values), 2)] - expected_index = self.create_index(df.index[0], periods=len(df.index) / - 2, freq='2D') - + expected_index = self.create_index(df.index[0], + periods=len(df.index) / 2, + freq='2D') # loffset coreces PeriodIndex to DateTimeIndex expected_index = expected_index.to_timestamp() expected_index += timedelta(hours=2) expected = DataFrame({'value': expected_means}, index=expected_index) - for arg in ['mean', {'value': 'mean'}, ['mean']]: - for kind_param in [None, 'period', 'timestamp']: - result_agg = (df.resample('2D', loffset='2H', kind=kind_param) - .agg(arg)) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result_how = df.resample('2D', how=arg, loffset='2H', - kind=kind_param) - if isinstance(arg, list): - expected.columns = (pd.MultiIndex - .from_tuples([('value', 'mean')])) - assert_frame_equal(result_agg, expected) - assert_frame_equal(result_how, expected) - - def test_upsampling_ohlc(self): + result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result_how = df.resample('2D', how=agg_arg, loffset='2H', + kind=kind) + if isinstance(agg_arg, list): + expected.columns = pd.MultiIndex .from_tuples([('value', 'mean')]) + assert_frame_equal(result_agg, expected) + 
assert_frame_equal(result_how, expected) + + @pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)]) + @pytest.mark.parametrize('kind', [None, 'period']) + def test_upsampling_ohlc(self, freq, period_mult, kind): # GH 13083 pi = PeriodIndex(start='2000', freq='D', periods=10) s = Series(range(len(pi)), index=pi) - expected = s.to_timestamp().resample('H').ohlc().to_period() - # timestamp-based resampling doesn't include all sub-periods - # of the last original period, so extend accordingly: - pi_ext = PeriodIndex(start='2000', freq='H', periods=24 * len(pi)) - expected = expected.reindex(pi_ext) - result = s.resample('H').ohlc() - assert_frame_equal(result, expected) - - def test_upsampling_ohlc_freq_multiples(self): - pi = PeriodIndex(start='2000', freq='D', periods=10) - s = pd.Series(range(len(pi)), index=pi) - expected = s.to_timestamp().resample('12H').ohlc().to_period('12H') + expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) # timestamp-based resampling doesn't include all sub-periods # of the last original period, so extend accordingly: - pi_ext = PeriodIndex(start='2000', freq='12H', periods=2 * len(pi)) - expected = expected.reindex(pi_ext) - result = s.resample('12H', kind='period').ohlc() + new_index = PeriodIndex(start='2000', freq=freq, + periods=period_mult * len(pi)) + expected = expected.reindex(new_index) + result = s.resample(freq, kind=kind).ohlc() assert_frame_equal(result, expected) - def test_resample_with_nat(self): + @pytest.mark.parametrize('freq, expected_values', + [('1s', [3, np.NaN, 7, 11]), + ('2s', [3, int((7 + 11) / 2)]), + ('3s', [int((3 + 7) / 2), 11])]) + def test_resample_with_nat(self, freq, expected_values): # GH 13224 index = PeriodIndex([pd.NaT, '1970-01-01 00:00:00', pd.NaT, - '1970-01-01 00:00:01', '1970-01-01 00:00:02'], + '1970-01-01 00:00:02', '1970-01-01 00:00:03'], freq='S') frame = DataFrame([2, 3, 5, 7, 11], index=index) - index_1s = PeriodIndex(['1970-01-01 00:00:00', '1970-01-01 
00:00:01', - '1970-01-01 00:00:02'], freq='S') - frame_1s = DataFrame([3, 7, 11], index=index_1s) - result_1s = frame.resample('1s').mean() - assert_frame_equal(result_1s, frame_1s) - - index_2s = PeriodIndex(['1970-01-01 00:00:00', - '1970-01-01 00:00:02'], freq='2S') - frame_2s = DataFrame([5, 11], index=index_2s) - result_2s = frame.resample('2s').mean() - assert_frame_equal(result_2s, frame_2s) - - index_3s = PeriodIndex(['1970-01-01 00:00:00'], freq='3S') - frame_3s = DataFrame([7], index=index_3s) - result_3s = frame.resample('3s').mean() - assert_frame_equal(result_3s, frame_3s) - - pi = PeriodIndex(['1970-01-01 00:00:00', pd.NaT, - '1970-01-01 00:00:02'], freq='S') - frame = DataFrame([2, 3, 5], index=pi) - expected_index = period_range(pi[0], periods=len(pi), freq=pi.freq) - expected = DataFrame([2, np.NaN, 5], index=expected_index) - result = frame.resample('1s').mean() + expected_index = period_range('1970-01-01 00:00:00', + periods=len(expected_values), freq=freq) + expected = DataFrame(expected_values, index=expected_index) + result = frame.resample(freq).mean() assert_frame_equal(result, expected) + def test_resample_with_only_nat(self): pi = PeriodIndex([pd.NaT] * 3, freq='S') frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) From 432c623d1598c02950c6a8939a21478dc2927ed7 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 6 May 2017 09:26:39 +0200 Subject: [PATCH 11/20] CLN/TST: call assert_almost_equal() when comparing Series/DataFrames Removed previously defined helper method assert_series_or_frame_equal(), which behaved identical to assert_almost_equal() --- pandas/tests/test_resample.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 2d2cf12455ba1..6e40b6921c3fe 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -53,13 +53,6 @@ def _simple_pts(start, end, freq='D'): return 
Series(np.random.randn(len(rng)), index=rng) -def assert_series_or_frame_equal(result, expected): - if isinstance(result, Series): - return assert_series_equal(result, expected) - elif isinstance(result, DataFrame): - return assert_frame_equal(result, expected) - - class TestResampleAPI(object): def setup_method(self, method): @@ -2271,7 +2264,7 @@ def test_asfreq_downsample(self, series_and_frame, freq, kind): closed='left') expected = obj.to_timestamp().reindex(new_index).to_period(freq) result = obj.resample(freq, kind=kind).asfreq() - assert_series_or_frame_equal(result, expected) + assert_almost_equal(result, expected) @pytest.mark.parametrize('freq', ['1H', '2H']) @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) @@ -2287,7 +2280,7 @@ def test_asfreq_upsample(self, series_and_frame, freq, kind): closed='left') expected = obj.to_timestamp().reindex(new_index).to_period(freq) result = obj.resample(freq, kind=kind).asfreq() - assert_series_or_frame_equal(result, expected) + assert_almost_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 From c8814fbd3d87ab78d3e77d196e9eb9df0da05672 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 13 May 2017 10:36:21 +0200 Subject: [PATCH 12/20] STYLE: added blank lines, removed odd whitespace, fixed typo --- pandas/tests/test_resample.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 6e40b6921c3fe..bba807c8daf7c 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -850,7 +850,7 @@ def test_resample_loffset_arg_type(self): periods=len(df.index) / 2, freq='2D') - # loffset coreces PeriodIndex to DateTimeIndex + # loffset coerces PeriodIndex to DateTimeIndex if isinstance(expected_index, PeriodIndex): expected_index = expected_index.to_timestamp() @@ -2847,7 +2847,8 @@ def test_loffset_returns_datetimeindex(self, frame, kind, 
agg_arg): expected_index = self.create_index(df.index[0], periods=len(df.index) / 2, freq='2D') - # loffset coreces PeriodIndex to DateTimeIndex + + # loffset coerces PeriodIndex to DateTimeIndex expected_index = expected_index.to_timestamp() expected_index += timedelta(hours=2) expected = DataFrame({'value': expected_means}, index=expected_index) @@ -2857,7 +2858,7 @@ def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): result_how = df.resample('2D', how=agg_arg, loffset='2H', kind=kind) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex .from_tuples([('value', 'mean')]) + expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')]) assert_frame_equal(result_agg, expected) assert_frame_equal(result_how, expected) @@ -2868,6 +2869,7 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): pi = PeriodIndex(start='2000', freq='D', periods=10) s = Series(range(len(pi)), index=pi) expected = s.to_timestamp().resample(freq).ohlc().to_period(freq) + # timestamp-based resampling doesn't include all sub-periods # of the last original period, so extend accordingly: new_index = PeriodIndex(start='2000', freq=freq, From 486ad6705078d12d8d5685157127432f65877881 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 13 May 2017 11:38:30 +0200 Subject: [PATCH 13/20] TST: add test case for multiple consecutive NaTs in PeriodIndex --- pandas/tests/test_resample.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index bba807c8daf7c..ebd383041a63a 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2878,16 +2878,22 @@ def test_upsampling_ohlc(self, freq, period_mult, kind): result = s.resample(freq, kind=kind).ohlc() assert_frame_equal(result, expected) + @pytest.mark.parametrize('periods, values', + [([pd.NaT, '1970-01-01 00:00:00', pd.NaT, + '1970-01-01 00:00:02', '1970-01-01 00:00:03'], + [2, 3, 5, 7, 11]), + 
([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT, + pd.NaT, pd.NaT, '1970-01-01 00:00:02', + '1970-01-01 00:00:03', pd.NaT, pd.NaT], + [1, 2, 3, 5, 6, 8, 7, 11, 12, 13])]) @pytest.mark.parametrize('freq, expected_values', [('1s', [3, np.NaN, 7, 11]), ('2s', [3, int((7 + 11) / 2)]), ('3s', [int((3 + 7) / 2), 11])]) - def test_resample_with_nat(self, freq, expected_values): + def test_resample_with_nat(self, periods, values, freq, expected_values): # GH 13224 - index = PeriodIndex([pd.NaT, '1970-01-01 00:00:00', pd.NaT, - '1970-01-01 00:00:02', '1970-01-01 00:00:03'], - freq='S') - frame = DataFrame([2, 3, 5, 7, 11], index=index) + index = PeriodIndex(periods, freq='S') + frame = DataFrame(values, index=index) expected_index = period_range('1970-01-01 00:00:00', periods=len(expected_values), freq=freq) From ad8519fbb7fbdfff922d62cf4c0b7d660c02ab2e Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 13 May 2017 11:42:09 +0200 Subject: [PATCH 14/20] TST/DOC: added issue number to test case --- pandas/tests/test_resample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index ebd383041a63a..70d64fb32dcd1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2902,6 +2902,7 @@ def test_resample_with_nat(self, periods, values, freq, expected_values): assert_frame_equal(result, expected) def test_resample_with_only_nat(self): + # GH 13224 pi = PeriodIndex([pd.NaT] * 3, freq='S') frame = DataFrame([2, 3, 5], index=pi) expected_index = PeriodIndex(data=[], freq=pi.freq) From 39fc7e2c2d53233b89118581e2f8486130d65cf7 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 13 May 2017 13:24:23 +0200 Subject: [PATCH 15/20] TST: consolidate test_asfreq_downsample, test_asfreq_upsample -> test_asfreq --- pandas/tests/test_resample.py | 67 +++++++++++------------------------ 1 file changed, 20 insertions(+), 47 deletions(-) diff --git a/pandas/tests/test_resample.py 
b/pandas/tests/test_resample.py index 70d64fb32dcd1..d6c32e867d7d0 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -724,35 +724,19 @@ def series_and_frame(self, request, index): if request.param == DataFrame: return self.frame(index) - def test_asfreq_downsample(self): - s = self.create_series() - - result = s.resample('2D').asfreq() - expected = s.reindex(s.index.take(np.arange(0, len(s.index), 2))) - expected.index.freq = to_offset('2D') - assert_series_equal(result, expected) - - frame = s.to_frame('value') - result = frame.resample('2D').asfreq() - expected = frame.reindex( - frame.index.take(np.arange(0, len(frame.index), 2))) - expected.index.freq = to_offset('2D') - assert_frame_equal(result, expected) - - def test_asfreq_upsample(self): - s = self.create_series() - - result = s.resample('1H').asfreq() - new_index = self.create_index(s.index[0], s.index[-1], freq='1H') - expected = s.reindex(new_index) - assert_series_equal(result, expected) + @pytest.mark.parametrize('freq', ['2D', '1H']) + def test_asfreq(self, series_and_frame, freq): + obj = series_and_frame - frame = s.to_frame('value') - result = frame.resample('1H').asfreq() - new_index = self.create_index(frame.index[0], - frame.index[-1], freq='1H') - expected = frame.reindex(new_index) - assert_frame_equal(result, expected) + result = obj.resample(freq).asfreq() + if freq == '2D': + new_index = obj.index.take(np.arange(0, len(obj.index), 2)) + new_index.freq = to_offset('2D') + else: + new_index = self.create_index(obj.index[0], obj.index[-1], + freq=freq) + expected = obj.reindex(new_index) + assert_almost_equal(result, expected) def test_asfreq_fill_value(self): # test for fill value during resampling, issue 3715 @@ -2250,32 +2234,18 @@ def create_series(self): return Series(np.arange(len(i)), index=i, name='pi') - @pytest.mark.parametrize('freq', ['2D']) + @pytest.mark.parametrize('freq', ['2D', '1H', '2H']) @pytest.mark.parametrize('kind', ['period', None, 
'timestamp']) - def test_asfreq_downsample(self, series_and_frame, freq, kind): + def test_asfreq(self, series_and_frame, freq, kind): # GH 12884, 15944 - obj = series_and_frame - start = obj.index[0].to_timestamp(how='start') - end = (obj.index[-1] + 1).to_timestamp(how='start') - if kind == 'timestamp': - expected = obj.to_timestamp().resample(freq).asfreq() - else: - new_index = date_range(start=start, end=end, freq=freq, - closed='left') - expected = obj.to_timestamp().reindex(new_index).to_period(freq) - result = obj.resample(freq, kind=kind).asfreq() - assert_almost_equal(result, expected) + # make sure .asfreq() returns PeriodIndex (except kind='timestamp') - @pytest.mark.parametrize('freq', ['1H', '2H']) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) - def test_asfreq_upsample(self, series_and_frame, freq, kind): - # GH 12884, 15944 obj = series_and_frame - start = obj.index[0].to_timestamp(how='start') - end = (obj.index[-1] + 1).to_timestamp(how='start') if kind == 'timestamp': expected = obj.to_timestamp().resample(freq).asfreq() else: + start = obj.index[0].to_timestamp(how='start') + end = (obj.index[-1] + 1).to_timestamp(how='start') new_index = date_range(start=start, end=end, freq=freq, closed='left') expected = obj.to_timestamp().reindex(new_index).to_period(freq) @@ -2913,6 +2883,9 @@ def test_resample_with_only_nat(self): class TestTimedeltaIndex(Base): _index_factory = lambda x: timedelta_range + _series_fixture_name = 'tdi' + _index_fixture_start = '1 day' + _index_fixture_end = '10 day' def create_series(self): i = timedelta_range('1 day', From efcad5b2133438f22c3f3cbd56ad87ca8f666a1a Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 13 May 2017 13:43:11 +0200 Subject: [PATCH 16/20] TST: set fixtures to default function scoping --- pandas/tests/test_resample.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 
d6c32e867d7d0..cf581d42cab5c 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -702,27 +702,28 @@ def create_index(self, *args, **kwargs): _index_fixture_end = datetime(2005, 1, 10) _index_fixture_freq = 'D' - @pytest.fixture(scope='class') + @pytest.fixture def index(self): return self.create_index(self._index_fixture_start, self._index_fixture_end, freq=self._index_fixture_freq) - @pytest.fixture(scope='class') + @pytest.fixture def series(self, index): return Series(np.arange(len(index)), index=index, name=self._series_fixture_name) - @pytest.fixture(scope='class') + @pytest.fixture def frame(self, index): return DataFrame({'value': np.arange(len(index))}, index=index) - @pytest.fixture(params=[Series, DataFrame], scope='class') + @pytest.fixture(params=[Series, DataFrame]) def series_and_frame(self, request, index): if request.param == Series: - return self.series(index) + return Series(np.arange(len(index)), index=index, + name=self._series_fixture_name) if request.param == DataFrame: - return self.frame(index) + return DataFrame({'value': np.arange(len(index))}, index=index) @pytest.mark.parametrize('freq', ['2D', '1H']) def test_asfreq(self, series_and_frame, freq): From 41401d4637ff9fd5bf8650b8ce1c4ed5eabf956e Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sat, 13 May 2017 14:21:22 +0200 Subject: [PATCH 17/20] TST: convert constant 'setup-like' values/objects to pytest fixtures --- pandas/tests/test_resample.py | 71 ++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index cf581d42cab5c..0580226ae8c26 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -18,7 +18,7 @@ from pandas.core.dtypes.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict -from pandas.core.base import SpecificationError +from pandas.core.base import 
SpecificationError, AbstractMethodError from pandas.errors import UnsupportedFunctionCall from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS @@ -698,32 +698,44 @@ def create_index(self, *args, **kwargs): factory = self._index_factory() return factory(*args, **kwargs) - _index_fixture_start = datetime(2005, 1, 1) - _index_fixture_end = datetime(2005, 1, 10) - _index_fixture_freq = 'D' + @pytest.fixture + def _index_start(self): + return datetime(2005, 1, 1) + + @pytest.fixture + def _index_end(self): + return datetime(2005, 1, 10) + + @pytest.fixture + def _index_freq(self): + return 'D' @pytest.fixture - def index(self): - return self.create_index(self._index_fixture_start, - self._index_fixture_end, - freq=self._index_fixture_freq) + def index(self, _index_start, _index_end, _index_freq): + return self.create_index(_index_start, _index_end, freq=_index_freq) @pytest.fixture - def series(self, index): - return Series(np.arange(len(index)), index=index, - name=self._series_fixture_name) + def _series_name(self): + raise AbstractMethodError(self) @pytest.fixture - def frame(self, index): - return DataFrame({'value': np.arange(len(index))}, index=index) + def _static_values(self, index): + return np.arange(len(index)) + + @pytest.fixture + def series(self, index, _series_name, _static_values): + return Series(_static_values, index=index, name=_series_name) + + @pytest.fixture + def frame(self, index, _static_values): + return DataFrame({'value': _static_values}, index=index) @pytest.fixture(params=[Series, DataFrame]) - def series_and_frame(self, request, index): + def series_and_frame(self, request, index, _series_name, _static_values): if request.param == Series: - return Series(np.arange(len(index)), index=index, - name=self._series_fixture_name) + return Series(_static_values, index=index, name=_series_name) if request.param == DataFrame: - return DataFrame({'value': np.arange(len(index))}, index=index) + return 
DataFrame({'value': _static_values}, index=index) @pytest.mark.parametrize('freq', ['2D', '1H']) def test_asfreq(self, series_and_frame, freq): @@ -876,7 +888,10 @@ def test_apply_to_empty_series(self): class TestDatetimeIndex(Base): _index_factory = lambda x: date_range - _series_fixture_name = 'dti' + + @pytest.fixture + def _series_name(self): + return 'dti' def setup_method(self, method): dti = DatetimeIndex(start=datetime(2005, 1, 1), @@ -2225,7 +2240,10 @@ def test_resample_datetime_values(self): class TestPeriodIndex(Base): _index_factory = lambda x: period_range - _series_fixture_name = 'pi' + + @pytest.fixture + def _series_name(self): + return 'pi' def create_series(self): # TODO: replace calls to .create_series() by injecting the series @@ -2884,9 +2902,18 @@ def test_resample_with_only_nat(self): class TestTimedeltaIndex(Base): _index_factory = lambda x: timedelta_range - _series_fixture_name = 'tdi' - _index_fixture_start = '1 day' - _index_fixture_end = '10 day' + + @pytest.fixture + def _index_start(self): + return '1 day' + + @pytest.fixture + def _index_end(self): + return '10 day' + + @pytest.fixture + def _series_name(self): + return 'tdi' def create_series(self): i = timedelta_range('1 day', From 398a6845400c56cac39a320418c73babc166c474 Mon Sep 17 00:00:00 2001 From: Andreas Winkler Date: Sun, 21 May 2017 17:46:50 +0200 Subject: [PATCH 18/20] DOC: whatsnew v0.21.0 entry (in API changes section) --- doc/source/whatsnew/v0.21.0.txt | 78 +++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4a3122a78b234..604a9e9551d8d 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -171,6 +171,84 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
_whatsnew_0210.api_breaking.period_index_resampling: + +``PeriodIndex`` resampling +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.period_range('2017-01', periods=12, freq='M') + + In [2]: s = pd.Series(np.arange(12), index=pi) + + In [3]: resampled = s.resample('2Q').mean() + + In [4]: resampled + Out[4]: + 2017-03-31 1.0 + 2017-09-30 5.5 + 2018-03-31 10.0 + Freq: 2Q-DEC, dtype: float64 + + In [5]: resampled.index + Out[5]: DatetimeIndex(['2017-03-31', '2017-09-30', '2018-03-31'], dtype='datetime64[ns]', freq='2Q-DEC') + +New Behavior: + +.. ipython:: python + + pi = pd.period_range('2017-01', periods=12, freq='M') + + s = pd.Series(np.arange(12), index=pi) + + resampled = s.resample('2Q').mean() + + resampled + + resampled.index + + +Upsampling and calling ``.ohlc()`` previously returned a ``Series``, basically identical to calling ``.asfreq()``. OHLC upsampling now returns a DataFrame with columns ``open``, ``high``, ``low`` and ``close`` (:issue:`13083`). This is consistent with downsampling and ``DatetimeIndex`` behavior. + +Previous Behavior: + +.. code-block:: ipython + + In [1]: pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + In [2]: s = pd.Series(np.arange(10), index=pi) + + In [3]: s.resample('H').ohlc() + Out[3]: + 2000-01-01 00:00 0.0 + ... + 2000-01-10 23:00 NaN + Freq: H, Length: 240, dtype: float64 + + In [4]: s.resample('M').ohlc() + Out[4]: + open high low close + 2000-01 0 9 0 9 + +New Behavior: + +.. 
ipython:: python + + pi = pd.PeriodIndex(start='2000-01-01', freq='D', periods=10) + + s = pd.Series(np.arange(10), index=pi) + + s.resample('H').ohlc() + + s.resample('M').ohlc() + + +As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) .. _whatsnew_0210.api_breaking.deps: From 8358c410b65252b7fc9752eefa448ba5620bfcf8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 28 Sep 2017 10:27:14 -0400 Subject: [PATCH 19/20] fixups --- doc/source/whatsnew/v0.21.0.txt | 10 +++++----- pandas/tests/test_resample.py | 8 -------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 604a9e9551d8d..6b6af71623e54 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -189,7 +189,7 @@ Previous Behavior: In [3]: resampled = s.resample('2Q').mean() In [4]: resampled - Out[4]: + Out[4]: 2017-03-31 1.0 2017-09-30 5.5 2018-03-31 10.0 @@ -224,14 +224,14 @@ Previous Behavior: In [2]: s = pd.Series(np.arange(10), index=pi) In [3]: s.resample('H').ohlc() - Out[3]: + Out[3]: 2000-01-01 00:00 0.0 - ... + ... 2000-01-10 23:00 NaN Freq: H, Length: 240, dtype: float64 In [4]: s.resample('M').ohlc() - Out[4]: + Out[4]: open high low close 2000-01 0 9 0 9 @@ -247,7 +247,7 @@ New Behavior: s.resample('M').ohlc() - + As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) .. 
_whatsnew_0210.api_breaking.deps: diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 0580226ae8c26..cd15203eccd82 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2817,14 +2817,6 @@ def test_evenly_divisible_with_no_extra_bins(self): result = df.resample('7D').sum() assert_frame_equal(result, expected) - def test_apply_to_empty_series(self): - # GH 14313 - series = self.create_series()[:0] - - for freq in ['M', 'D', 'H']: - with pytest.raises(TypeError): - series.resample(freq).apply(lambda x: 1) - @pytest.mark.parametrize('kind', ['period', None, 'timestamp']) @pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']]) def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg): From 6084e0ce24218056de5890d88a1def47b636e63e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 29 Sep 2017 08:22:30 -0400 Subject: [PATCH 20/20] moar whatsnew --- doc/source/whatsnew/v0.21.0.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 6b6af71623e54..eafe8d08aafaa 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -176,7 +176,7 @@ Backwards incompatible API changes ``PeriodIndex`` resampling ^^^^^^^^^^^^^^^^^^^^^^^^^^ -In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). +In previous versions of pandas, resampling a ``Series``/``DataFrame`` indexed by a ``PeriodIndex`` returned a ``DatetimeIndex`` in some cases (:issue:`12884`). Resampling to a multiplied frequency now returns a ``PeriodIndex`` (:issue:`15944`). 
As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`). Previous Behavior: @@ -248,8 +248,6 @@ New Behavior: s.resample('M').ohlc() - -As a minor enhancement, resampling a ``PeriodIndex`` can now handle ``NaT`` values (:issue:`13224`) - .. _whatsnew_0210.api_breaking.deps: Dependencies have increased minimum versions