diff --git a/doc/source/styled.xlsx b/doc/source/styled.xlsx new file mode 100644 index 00000000000000..1233ff2b8692ba Binary files /dev/null and b/doc/source/styled.xlsx differ diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 8f98566ceae561..728f762e091669 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -23,6 +23,7 @@ Other Enhancements - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`) - Add ``casefold`` to ``Series.str`` (:issue:`25405`) +- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1037e2d9a3bd63..a6697e8879b08b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -460,7 +460,7 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def union(self, other): + def union(self, other, sort=None): """ Specialized union for DatetimeIndex objects. If combine overlapping ranges with the same DateOffset, will be much @@ -469,15 +469,29 @@ def union(self, other): Parameters ---------- other : DatetimeIndex or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + 3. Some values in `self` or `other` cannot be compared. + A RuntimeWarning is issued in this case. + + * False : do not sort the result + + .. versionadded:: 0.25.0 Returns ------- y : Index or DatetimeIndex """ + self._validate_sort_keyword(sort) self._assert_can_do_setop(other) if len(other) == 0 or self.equals(other) or len(self) == 0: - return super(DatetimeIndex, self).union(other) + return super(DatetimeIndex, self).union(other, sort=sort) if not isinstance(other, DatetimeIndex): try: @@ -488,9 +502,9 @@ def union(self, other): this, other = self._maybe_utc_convert(other) if this._can_fast_union(other): - return this._fast_union(other) + return this._fast_union(other, sort=sort) else: - result = Index.union(this, other) + result = Index.union(this, other, sort=sort) if isinstance(result, DatetimeIndex): # TODO: we shouldn't be setting attributes like this; # in all the tests this equality already holds @@ -563,16 +577,28 @@ def _can_fast_union(self, other): # this will raise return False - def _fast_union(self, other): + def _fast_union(self, other, sort=None): if len(other) == 0: return self.view(type(self)) if len(self) == 0: return other.view(type(self)) - # to make our life easier, "sort" the two ranges + # Both DTIs are monotonic. Check if they are already + # in the "correct" order if self[0] <= other[0]: left, right = self, other + # DTIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + elif sort is False: + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side='left') + right_chunk = right.values[:loc] + dates = _concat._concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + # DTIs are not in the "correct" order and we want + # to sort else: left, right = other, self diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index c57d27ff03ac6a..493ee65f63c6ac 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -32,6 +32,8 @@ class UnsortedIndexError(KeyError): class ParserError(ValueError): """ Exception that is raised by an error encountered in `pd.read_csv`. + + e.g. HTML Parsing will raise this error. """ diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 19009e45ee83a4..cf1f75234ec621 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -21,83 +21,107 @@ class TestDatetimeIndexSetOps(object): 'dateutil/US/Pacific'] # TODO: moved from test_datetimelike; dedup with version below - def test_union2(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union2(self, sort): everything = tm.makeDateIndex(10) first = everything[:5] second = everything[5:] - union = first.union(second) - assert tm.equalContents(union, everything) + union = first.union(second, sort=sort) + tm.assert_index_equal(union, everything) # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - result = first.union(case) - assert tm.equalContents(result, everything) + result = first.union(case, sort=sort) + tm.assert_index_equal(result, everything) @pytest.mark.parametrize("tz", tz) - def test_union(self, tz): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, tz, sort): rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other1 = pd.date_range('1/6/2000', freq='D', periods=5, tz=tz) expected1 = pd.date_range('1/1/2000', freq='D', periods=10, tz=tz) + expected1_notsorted = pd.DatetimeIndex(list(other1) + list(rng1)) rng2 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other2 = pd.date_range('1/4/2000', freq='D', periods=5, tz=tz) expected2 = pd.date_range('1/1/2000', freq='D', periods=8, tz=tz) + expected2_notsorted = pd.DatetimeIndex(list(other2) + list(rng2[:3])) rng3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) other3 = pd.DatetimeIndex([], tz=tz) expected3 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) + expected3_notsorted = rng3 - for rng, other, expected in [(rng1, other1, expected1), - (rng2, other2, expected2), - (rng3, other3, expected3)]: + for rng, other, exp, exp_notsorted in [(rng1, other1, expected1, + expected1_notsorted), + (rng2, other2, expected2, + expected2_notsorted), + (rng3, other3, expected3, + expected3_notsorted)]: - result_union = rng.union(other) - tm.assert_index_equal(result_union, expected) + result_union = rng.union(other, sort=sort) + tm.assert_index_equal(result_union, exp) - def test_union_coverage(self): + result_union = other.union(rng, sort=sort) + if sort is None: + tm.assert_index_equal(result_union, exp) + else: + tm.assert_index_equal(result_union, exp_notsorted) + + @pytest.mark.parametrize("sort", [None, False]) + def test_union_coverage(self, sort): idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) ordered = DatetimeIndex(idx.sort_values(), freq='infer') - result = ordered.union(idx) + result = ordered.union(idx, sort=sort) tm.assert_index_equal(result, ordered) - result = ordered[:0].union(ordered) + result = ordered[:0].union(ordered, sort=sort) tm.assert_index_equal(result, ordered) assert result.freq == ordered.freq - def test_union_bug_1730(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1730(self, sort): rng_a = date_range('1/1/2012', periods=4, freq='3H') rng_b = date_range('1/1/2012', periods=4, freq='4H') - result = rng_a.union(rng_b) + result = rng_a.union(rng_b, sort=sort) exp = DatetimeIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) tm.assert_index_equal(result, exp) - def test_union_bug_1745(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_1745(self, sort): left = DatetimeIndex(['2012-05-11 15:19:49.695000']) right = DatetimeIndex(['2012-05-29 13:04:21.322000', '2012-05-11 15:27:24.873000', '2012-05-11 15:31:05.350000']) - result = left.union(right) - exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) + result = left.union(right, sort=sort) + exp = DatetimeIndex(['2012-05-11 15:19:49.695000', + '2012-05-29 13:04:21.322000', + '2012-05-11 15:27:24.873000', + '2012-05-11 15:31:05.350000']) + if sort is None: + exp = exp.sort_values() tm.assert_index_equal(result, exp) - def test_union_bug_4564(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_bug_4564(self, sort): from pandas import DateOffset left = date_range("2013-01-01", "2013-02-01") right = left + DateOffset(minutes=15) - result = left.union(right) + result = left.union(right, sort=sort) exp = DatetimeIndex(sorted(set(list(left)) | set(list(right)))) tm.assert_index_equal(result, exp) - def test_union_freq_both_none(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_freq_both_none(self, sort): # GH11086 expected = bdate_range('20150101', periods=10) expected.freq = None - result = expected.union(expected) + result = expected.union(expected, sort=sort) tm.assert_index_equal(result, expected) assert result.freq is None @@ -112,11 +136,14 @@ def test_union_dataframe_index(self): exp = pd.date_range('1/1/1980', '1/1/2012', freq='MS') tm.assert_index_equal(df.index, exp) - def test_union_with_DatetimeIndex(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_with_DatetimeIndex(self, sort): i1 = Int64Index(np.arange(0, 20, 2)) i2 = date_range(start='2012-01-03 00:00:00', periods=10, freq='D') - i1.union(i2) # Works - i2.union(i1) # Fails with "AttributeError: can't set attribute" + # Works + i1.union(i2, sort=sort) + # Fails with "AttributeError: can't set attribute" + i2.union(i1, sort=sort) # TODO: moved from test_datetimelike; de-duplicate with version below def test_intersection2(self): @@ -262,11 +289,12 @@ def test_datetimeindex_diff(self, sort): periods=98) assert len(dti1.difference(dti2, sort)) == 2 - def test_datetimeindex_union_join_empty(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_datetimeindex_union_join_empty(self, sort): dti = date_range(start='1/1/2001', end='2/1/2001', freq='D') empty = Index([]) - result = dti.union(empty) + result = dti.union(empty, sort=sort) assert isinstance(result, DatetimeIndex) assert result is result @@ -287,35 +315,40 @@ class TestBusinessDatetimeIndex(object): def setup_method(self, method): self.rng = bdate_range(START, END) - def test_union(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): # overlapping left = self.rng[:10] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, Index) # non-overlapping, no gap left = self.rng[:5] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # order does not matter - tm.assert_index_equal(right.union(left), the_union) + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) + else: + expected = pd.DatetimeIndex(list(right) + list(left)) + tm.assert_index_equal(right.union(left, sort=sort), expected) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_union = self.rng.union(rng) + the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) def test_outer_join(self): @@ -350,16 +383,21 @@ def test_outer_join(self): assert isinstance(the_join, DatetimeIndex) assert the_join.freq is None - def test_union_not_cacheable(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union_not_cacheable(self, sort): rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] - the_union = rng1.union(rng2) - tm.assert_index_equal(the_union, rng) + the_union = rng1.union(rng2, sort=sort) + if sort is None: + tm.assert_index_equal(the_union, rng) + else: + expected = pd.DatetimeIndex(list(rng[10:]) + list(rng[:10])) + tm.assert_index_equal(the_union, expected) rng1 = rng[10:] rng2 = rng[15:35] - the_union = rng1.union(rng2) + the_union = rng1.union(rng2, sort=sort) expected = rng[10:] tm.assert_index_equal(the_union, expected) @@ -388,7 +426,8 @@ def test_intersection_bug(self): result = a.intersection(b) tm.assert_index_equal(result, b) - def test_month_range_union_tz_pytz(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_pytz(self, sort): from pytz import timezone tz = timezone('US/Eastern') @@ -403,10 +442,11 @@ def test_month_range_union_tz_pytz(self): late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) - early_dr.union(late_dr) + early_dr.union(late_dr, sort=sort) @td.skip_if_windows_python_3 - def test_month_range_union_tz_dateutil(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_month_range_union_tz_dateutil(self, sort): from pandas._libs.tslibs.timezones import dateutil_gettz tz = dateutil_gettz('US/Eastern') @@ -421,7 +461,7 @@ def test_month_range_union_tz_dateutil(self): late_dr = date_range(start=late_start, end=late_end, tz=tz, freq=MonthEnd()) - early_dr.union(late_dr) + early_dr.union(late_dr, sort=sort) class TestCustomDatetimeIndex(object): @@ -429,35 +469,37 @@ class TestCustomDatetimeIndex(object): def setup_method(self, method): self.rng = bdate_range(START, END, freq='C') - def test_union(self): + @pytest.mark.parametrize("sort", [None, False]) + def test_union(self, sort): # overlapping left = self.rng[:10] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # non-overlapping, gap in middle left = self.rng[:5] right = self.rng[10:] - the_union = left.union(right) + the_union = left.union(right, sort) assert isinstance(the_union, Index) # non-overlapping, no gap left = self.rng[:5] right = self.rng[5:10] - the_union = left.union(right) + the_union = left.union(right, sort=sort) assert isinstance(the_union, DatetimeIndex) # order does not matter - tm.assert_index_equal(right.union(left), the_union) + if sort is None: + tm.assert_index_equal(right.union(left, sort=sort), the_union) # overlapping, but different offset rng = date_range(START, END, freq=BMonthEnd()) - the_union = self.rng.union(rng) + the_union = self.rng.union(rng, sort=sort) assert isinstance(the_union, DatetimeIndex) def test_outer_join(self):