From 4c0a1818ef84cea9abda8a045aa7567ae9def774 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Mon, 12 Jun 2017 14:16:53 +0100 Subject: [PATCH 1/7] Fixes GH16624 --- doc/source/whatsnew/v0.20.3.txt | 2 +- .../tests/indexes/timedeltas/test_timedelta.py | 2 +- pandas/tests/tseries/test_frequencies.py | 7 ++++++- pandas/tseries/frequencies.py | 16 ++++++++++++++-- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index f21230693686e..3147c433fa838 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -66,7 +66,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - +- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) Sparse ^^^^^^ diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 79fe0a864f246..e719d2d6fbcc2 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -567,7 +567,7 @@ class TestSlicing(object): def test_timedelta(self): # this is valid too - index = date_range('1/1/2000', periods=50, freq='B') + index = date_range('1/1/2000', periods=50, freq='D') shifted = index + timedelta(1) back = shifted + timedelta(-1) assert tm.equalContents(index, back) diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 2edca1bd4676b..9dd45f9515360 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -504,9 +504,14 @@ def test_raise_if_too_few(self): pytest.raises(ValueError, frequencies.infer_freq, index) def test_business_daily(self): - index = _dti(['12/31/1998', '1/3/1999', '1/4/1999']) + index = _dti(['01/01/1999', '1/4/1999', '1/5/1999']) assert frequencies.infer_freq(index) == 'B' + def test_business_daily_look_alike(self): + # 'weekend' (2-day gap) in wrong place + index = _dti(['12/31/1998', '1/3/1999', '1/4/1999']) + assert frequencies.infer_freq(index) is None + def test_day(self): self._check_tick(timedelta(1), 'D') diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index dddf835424f67..3a4ea94ced2b9 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -975,8 +975,7 @@ def _infer_daily_rule(self): else: return _maybe_add_count('D', days) - # Business daily. Maybe - if self.day_deltas == [1, 3]: + if self._is_business_daily(): return 'B' wom_rule = self._get_wom_rule() @@ -1012,6 +1011,19 @@ def _get_monthly_rule(self): return {'cs': 'MS', 'bs': 'BMS', 'ce': 'M', 'be': 'BM'}.get(pos_check) + WORKING_DAY_SHIFTS = set([(0, 1), (1, 2), (2, 3), (3, 4), (4, 0)]) + + def _is_business_daily(self): + if self.day_deltas != [1, 3]: # quick check: cannot be business daily + return False + # probably business daily, but need to confirm + first_weekday = self.index[0].weekday() + shifts = np.diff(np.asarray(self.index).view('i8')) + shifts = np.floor_divide(shifts, _ONE_DAY) + weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) + return np.all(((weekdays == 0) & (shifts == 3)) | + ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) + def _get_wom_rule(self): # wdiffs = unique(np.diff(self.index.week)) # We also need -47, -49, -48 to catch index spanning year boundary From 310d412a7f9225cfc878853a5a26f9d9588c79e0 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Mon, 12 Jun 2017 14:47:30 +0100 Subject: [PATCH 2/7] Remove unused code --- pandas/tseries/frequencies.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3a4ea94ced2b9..8714f92b595b6 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1011,8 +1011,6 @@ def _get_monthly_rule(self): return {'cs': 'MS', 'bs': 'BMS', 'ce': 'M', 'be': 'BM'}.get(pos_check) - WORKING_DAY_SHIFTS = set([(0, 1), (1, 2), (2, 3), (3, 4), (4, 0)]) - def _is_business_daily(self): if self.day_deltas != [1, 3]: # quick check: cannot be business daily return False From 84ca63afe9393f78effdac7c0452d815002e9512 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 6 Jul 2017 14:50:10 +0100 Subject: [PATCH 3/7] TST: timedelta arithmetic on DatetimeIndex with 'B' and 'D' frequency --- .../indexes/timedeltas/test_timedelta.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index e719d2d6fbcc2..da7dbf25fa241 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -567,26 +567,34 @@ class TestSlicing(object): def test_timedelta(self): # this is valid too - index = date_range('1/1/2000', periods=50, freq='D') - shifted = index + timedelta(1) - back = shifted + timedelta(-1) - assert tm.equalContents(index, back) - assert shifted.freq == index.freq - assert shifted.freq == back.freq - - result = index - timedelta(1) - expected = index + timedelta(-1) - tm.assert_index_equal(result, expected) - - # GH4134, buggy with timedeltas - rng = date_range('2013', '2014') - s = Series(rng) - result1 = rng - pd.offsets.Hour(1) - result2 = DatetimeIndex(s - np.timedelta64(100000000)) - result3 = rng - np.timedelta64(100000000) - result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - tm.assert_index_equal(result1, result4) - tm.assert_index_equal(result2, result3) + for freq in ['D', 'B']: + index = date_range('1/1/2000', periods=50, freq=freq) + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + assert tm.equalContents(index, back) + if freq == 'D': + expected = pd.tseries.offsets.Day(1) + assert index.freq == expected + assert shifted.freq == expected + assert back.freq == expected + else: # freq == 'B' + assert index.freq == pd.tseries.offsets.BusinessDay(1) + assert shifted.freq == None + assert back.freq == pd.tseries.offsets.BusinessDay(1) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) + + # GH4134, buggy with timedeltas + rng = date_range('2013', '2014') + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) class TestTimeSeries(object): From b79733abb125a4395a4459dcdc70ac500ed2eaf4 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 6 Jul 2017 15:18:14 +0100 Subject: [PATCH 4/7] ASV: test performance of infer_freq on business daily index --- asv_bench/benchmarks/timeseries.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index f5ea4d7875931..f083be06a1173 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -55,6 +55,9 @@ def setup(self): self.rng7 = date_range(start='1/1/1700', freq='D', periods=100000) self.a = self.rng7[:50000].append(self.rng7[50002:]) + self.rng8 = date_range(start='1/1/1700', freq='B', periods=100000) + self.b = self.rng8[:50000].append(self.rng8[50000:]) + def time_add_timedelta(self): (self.rng + dt.timedelta(minutes=2)) @@ -94,9 +97,12 @@ def time_infer_dst(self): def time_timeseries_is_month_start(self): self.rng6.is_month_start - def time_infer_freq(self): + def time_infer_freq_daily(self): infer_freq(self.a) + def time_infer_freq_business(self): + infer_freq(self.b) + class TimeDatetimeConverter(object): goal_time = 0.2 From 60f55a702c141feeb5fb7bae32a382454a7d5f4b Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Thu, 6 Jul 2017 17:18:18 +0100 Subject: [PATCH 5/7] ASV: more nuanced tests of infer_freq --- asv_bench/benchmarks/timeseries.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index f083be06a1173..efe713639fec9 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -53,10 +53,11 @@ def setup(self): self.rng6 = date_range(start='1/1/1', periods=self.N, freq='B') self.rng7 = date_range(start='1/1/1700', freq='D', periods=100000) - self.a = self.rng7[:50000].append(self.rng7[50002:]) + self.no_freq = self.rng7[:50000].append(self.rng7[50002:]) + self.d_freq = self.rng7[:50000].append(self.rng7[50000:]) self.rng8 = date_range(start='1/1/1700', freq='B', periods=100000) - self.b = self.rng8[:50000].append(self.rng8[50000:]) + self.b_freq = self.rng8[:50000].append(self.rng8[50000:]) def time_add_timedelta(self): (self.rng + dt.timedelta(minutes=2)) @@ -97,11 +98,14 @@ def time_infer_dst(self): def time_timeseries_is_month_start(self): self.rng6.is_month_start + def time_infer_freq_none(self): + infer_freq(self.no_freq) + def time_infer_freq_daily(self): - infer_freq(self.a) + infer_freq(self.d_freq) def time_infer_freq_business(self): - infer_freq(self.b) + infer_freq(self.b_freq) class TimeDatetimeConverter(object): From d3932dd128d1acec1a8c5c16fa289e05262b6257 Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 7 Jul 2017 09:01:32 +0100 Subject: [PATCH 6/7] Address jreback's review whatsnew: bump from 0.20.3 to 0.21.0 test_timedelta: parametrize, strengthen index check test_frequencies: add issue number frequencies: formatting, np.asarray -> asi8 --- doc/source/whatsnew/v0.20.3.txt | 1 - doc/source/whatsnew/v0.21.0.txt | 2 +- .../indexes/timedeltas/test_timedelta.py | 60 +++++++++---------- pandas/tests/tseries/test_frequencies.py | 2 +- pandas/tseries/frequencies.py | 6 +- 5 files changed, 36 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index 3147c433fa838..a3a44feb55c7c 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -66,7 +66,6 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) Sparse ^^^^^^ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 36ca79e8b8714..11b31a7f4c55c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -113,7 +113,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - +- Bug in ``infer_freq`` causing indices with 2-day gaps during the working week to be wrongly inferred as business daily (:issue:`16624`) Sparse ^^^^^^ diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index da7dbf25fa241..585e85bd11c46 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -564,37 +564,37 @@ def test_freq_conversion(self): class TestSlicing(object): + @pytest.mark.parametrize('freq', ['B', 'D']) + def test_timedelta(self, freq): + index = date_range('1/1/2000', periods=50, freq=freq) + + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + tm.assert_index_equal(index, back) + + if freq == 'D': + expected = pd.tseries.offsets.Day(1) + assert index.freq == expected + assert shifted.freq == expected + assert back.freq == expected + else: # freq == 'B' + assert index.freq == pd.tseries.offsets.BusinessDay(1) + assert shifted.freq == None + assert back.freq == pd.tseries.offsets.BusinessDay(1) + + result = index - timedelta(1) + expected = index + timedelta(-1) + tm.assert_index_equal(result, expected) - def test_timedelta(self): - # this is valid too - for freq in ['D', 'B']: - index = date_range('1/1/2000', periods=50, freq=freq) - shifted = index + timedelta(1) - back = shifted + timedelta(-1) - assert tm.equalContents(index, back) - if freq == 'D': - expected = pd.tseries.offsets.Day(1) - assert index.freq == expected - assert shifted.freq == expected - assert back.freq == expected - else: # freq == 'B' - assert index.freq == pd.tseries.offsets.BusinessDay(1) - assert shifted.freq == None - assert back.freq == pd.tseries.offsets.BusinessDay(1) - - result = index - timedelta(1) - expected = index + timedelta(-1) - tm.assert_index_equal(result, expected) - - # GH4134, buggy with timedeltas - rng = date_range('2013', '2014') - s = Series(rng) - result1 = rng - pd.offsets.Hour(1) - result2 = DatetimeIndex(s - np.timedelta64(100000000)) - result3 = rng - np.timedelta64(100000000) - result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - tm.assert_index_equal(result1, result4) - tm.assert_index_equal(result2, result3) + # GH4134, buggy with timedeltas + rng = date_range('2013', '2014') + s = Series(rng) + result1 = rng - pd.offsets.Hour(1) + result2 = DatetimeIndex(s - np.timedelta64(100000000)) + result3 = rng - np.timedelta64(100000000) + result4 = DatetimeIndex(s - pd.offsets.Hour(1)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) class TestTimeSeries(object): diff --git a/pandas/tests/tseries/test_frequencies.py b/pandas/tests/tseries/test_frequencies.py index 9dd45f9515360..54d12317b0bf8 100644 --- a/pandas/tests/tseries/test_frequencies.py +++ b/pandas/tests/tseries/test_frequencies.py @@ -508,7 +508,7 @@ def test_business_daily(self): assert frequencies.infer_freq(index) == 'B' def test_business_daily_look_alike(self): - # 'weekend' (2-day gap) in wrong place + # GH 16624, do not infer 'B' when 'weekend' (2-day gap) in wrong place index = _dti(['12/31/1998', '1/3/1999', '1/4/1999']) assert frequencies.infer_freq(index) is None diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 8714f92b595b6..8640f106a048a 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1012,11 +1012,13 @@ def _get_monthly_rule(self): 'ce': 'M', 'be': 'BM'}.get(pos_check) def _is_business_daily(self): - if self.day_deltas != [1, 3]: # quick check: cannot be business daily + # quick check: cannot be business daily + if self.day_deltas != [1, 3]: return False + # probably business daily, but need to confirm first_weekday = self.index[0].weekday() - shifts = np.diff(np.asarray(self.index).view('i8')) + shifts = np.diff(self.index.asi8) shifts = np.floor_divide(shifts, _ONE_DAY) weekdays = np.mod(first_weekday + np.cumsum(shifts), 7) return np.all(((weekdays == 0) & (shifts == 3)) | From 6042aa4148e314a5973c8c313a2e68168bd2de2e Mon Sep 17 00:00:00 2001 From: Adam Gleave Date: Fri, 7 Jul 2017 11:30:18 +0100 Subject: [PATCH 7/7] TST: fix lint error --- pandas/tests/indexes/timedeltas/test_timedelta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 585e85bd11c46..08cf5108ffdb1 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -579,7 +579,7 @@ def test_timedelta(self, freq): assert back.freq == expected else: # freq == 'B' assert index.freq == pd.tseries.offsets.BusinessDay(1) - assert shifted.freq == None + assert shifted.freq is None assert back.freq == pd.tseries.offsets.BusinessDay(1) result = index - timedelta(1)