diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 470129d6d860b..0bf85f3e6222c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -370,6 +370,45 @@ In cases with mixed-resolution inputs, the highest resolution is used: In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype Out[2]: dtype(' Index: """ Extract combined index: return intersection or union (depending on the @@ -81,7 +81,8 @@ def get_objs_combined_axis( axis : {0 or 'index', 1 or 'outer'}, default 0 The axis to extract indexes from. sort : bool, default True - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. NoDefault + used for deprecation in GH#57335. Returns ------- @@ -108,7 +109,7 @@ def _get_distinct_objs(objs: list[Index]) -> list[Index]: def _get_combined_index( indexes: list[Index], intersect: bool = False, - sort: bool = False, + sort: bool | lib.NoDefault = False, ) -> Index: """ Return the union or intersection of indexes. @@ -121,7 +122,8 @@ def _get_combined_index( If True, calculate the intersection between indexes. Otherwise, calculate the union. sort : bool, default False - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. 
NoDefault + used for deprecation of GH#57335 Returns ------- @@ -138,10 +140,10 @@ def _get_combined_index( for other in indexes[1:]: index = index.intersection(other) else: - index = union_indexes(indexes, sort=False) + index = union_indexes(indexes, sort=sort if sort is lib.no_default else False) index = ensure_index(index) - if sort: + if sort and sort is not lib.no_default: index = safe_sort_index(index) return index @@ -180,7 +182,7 @@ def safe_sort_index(index: Index) -> Index: return index -def union_indexes(indexes, sort: bool | None = True) -> Index: +def union_indexes(indexes, sort: bool | None | lib.NoDefault = True) -> Index: """ Return the union of indexes. @@ -190,7 +192,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: ---------- indexes : list of Index or list objects sort : bool, default True - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. NoDefault + used for deprecation of GH#57335. Returns ------- @@ -201,7 +204,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: if len(indexes) == 1: result = indexes[0] if isinstance(result, list): - if not sort: + if not sort or sort is lib.no_default: result = Index(result) else: result = Index(sorted(result)) @@ -227,7 +230,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if num_dtis == len(indexes): - sort = True + if sort is lib.no_default: + sort = True result = indexes[0] elif num_dtis > 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index e39c716784455..e38d3c7deb8df 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -46,6 +46,7 @@ get_objs_combined_axis, get_unanimous_names, ) +from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.internals import concatenate_managers if TYPE_CHECKING: @@ -162,7 +163,7 @@ def concat( levels=None, names: 
list[HashableT] | None = None, verify_integrity: bool = False, - sort: bool = False, + sort: bool | lib.NoDefault = lib.no_default, copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Series: """ @@ -405,14 +406,43 @@ def concat( "Only can inner (intersect) or outer (union) join the other axis" ) - if not is_bool(sort): - raise ValueError( - f"The 'sort' keyword only accepts boolean values; {sort} was passed." - ) - sort = bool(sort) - objs, keys, ndims = _clean_keys_and_objs(objs, keys) + if sort is lib.no_default: + if axis == 0: + non_concat_axis = [ + obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name]) + for obj in objs + ] + else: + non_concat_axis = [obj.index for obj in objs] + + if all(isinstance(index, DatetimeIndex) for index in non_concat_axis): + warn = any( + id(prev) != id(curr) + for prev, curr in zip(non_concat_axis, non_concat_axis[1:]) + ) and any( + prev[-1] > curr[0] + for prev, curr in zip(non_concat_axis, non_concat_axis[1:]) + if not prev.empty and not curr.empty + ) + if warn: + msg = ( + "Sorting by default when concatenating all DatetimeIndex is " + "deprecated. In the future, pandas will respect the default " + "of `sort=False`. Specify `sort=True` or `sort=False` to " + "silence this message." + ) + warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level()) + else: + sort = False + else: + if not is_bool(sort): + raise ValueError( + f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
+ ) + sort = bool(sort) + # select an object to be our result reference sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 04c584c226aed..78b9676443042 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -263,7 +263,7 @@ def pivot_table( pieces.append(_table) keys.append(getattr(func, "__name__", func)) - table = concat(pieces, keys=keys, axis=1) + table = concat(pieces, keys=keys, axis=1, sort=False) return table.__finalize__(data, method="pivot_table") table = __internal_pivot_table( diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3616a93321358..71485fa7d3169 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1109,7 +1109,9 @@ def func(_start, _stop, _where): ] # concat and return - return concat(objs, axis=axis, verify_integrity=False)._consolidate() + return concat( + objs, axis=axis, verify_integrity=False, sort=False + )._consolidate() # create the iterator it = TableIterator( @@ -4860,7 +4862,7 @@ def read( if len(frames) == 1: df = frames[0] else: - df = concat(frames, axis=1) + df = concat(frames, axis=1, sort=False) selection = Selection(self, where=where, start=start, stop=stop) # apply the selection filters & axis orderings diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 479f2468a86ab..8a933f3afc40f 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -7,6 +7,7 @@ from pandas._libs.tslibs import Timestamp from pandas.compat import PY312 +from pandas.errors import Pandas4Warning import pandas as pd from pandas import ( @@ -887,7 +888,9 @@ def test_append_to_multiple(setup_path): ) df2 = df1.copy().rename(columns="{}_2".format) df2["foo"] = "bar" - df = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with 
tm.assert_produces_warning(Pandas4Warning, match=msg): + df = concat([df1, df2], axis=1) with ensure_clean_store(setup_path) as store: # exceptions @@ -928,7 +931,9 @@ def test_append_to_multiple_dropna(setup_path): index=date_range("2000-01-01", periods=10, freq="B"), ).rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + df = concat([df1, df2], axis=1) with ensure_clean_store(setup_path) as store: # dropna=True should guarantee rows are synchronized @@ -949,7 +954,9 @@ def test_append_to_multiple_dropna_false(setup_path): ) df2 = df1.copy().rename(columns="{}_2".format) df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan - df = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + df = concat([df1, df2], axis=1) with ( ensure_clean_store(setup_path) as store, diff --git a/pandas/tests/io/pytables/test_select.py b/pandas/tests/io/pytables/test_select.py index 0dffb284fa6d2..e4aebadb1ce67 100644 --- a/pandas/tests/io/pytables/test_select.py +++ b/pandas/tests/io/pytables/test_select.py @@ -3,6 +3,7 @@ from pandas._libs.tslibs import Timestamp from pandas.compat import PY312 +from pandas.errors import Pandas4Warning import pandas as pd from pandas import ( @@ -411,7 +412,9 @@ def test_select_iterator(tmp_path, setup_path): df2["foo"] = "bar" store.append("df2", df2) - df = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + df = concat([df1, df2], axis=1) # full selection expected = store.select_as_multiple(["df1", "df2"], selector="df1") @@ -901,7 +904,9 @@ def test_select_as_multiple(setup_path): result = 
store.select_as_multiple( ["df1", "df2"], where=["A>0", "B>0"], selector="df1" ) - expected = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & (expected.B > 0)] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds @@ -910,7 +915,9 @@ def test_select_as_multiple(setup_path): result = store.select_as_multiple( ["df1", "df2"], where="index>df2.index[4]", selector="df2" ) - expected = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = concat([df1, df2], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 5cfefeb469e8a..5792dbb6a2432 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -8,6 +8,7 @@ import pytest from pandas.compat import PY312 +from pandas.errors import Pandas4Warning import pandas as pd from pandas import ( @@ -732,9 +733,13 @@ def test_coordinates(setup_path): c = store.select_as_coordinates("df1", ["A>0", "B>0"]) df1_result = store.select("df1", c) df2_result = store.select("df2", c) - result = concat([df1_result, df2_result], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat([df1_result, df2_result], axis=1) - expected = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = concat([df1, df2], axis=1) expected = expected[(expected.A > 0) & 
(expected.B > 0)] tm.assert_frame_equal(result, expected, check_freq=False) # FIXME: 2021-01-18 on some (mostly windows) builds we get freq=None diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index ab88d221864c0..7f7a48707bb33 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -602,11 +602,13 @@ def test_resample_ohlc_dataframe(unit): df.index = df.index.as_unit(unit) df.columns.name = "Cols" res = df.resample("h").ohlc() - exp = pd.concat( - [df["VOLUME"].resample("h").ohlc(), df["PRICE"].resample("h").ohlc()], - axis=1, - keys=df.columns, - ) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + exp = pd.concat( + [df["VOLUME"].resample("h").ohlc(), df["PRICE"].resample("h").ohlc()], + axis=1, + keys=df.columns, + ) assert exp.columns.names[0] == "Cols" tm.assert_frame_equal(exp, res) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 845b5ad7acc00..de3ea0fb36b78 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -6,6 +6,7 @@ from pandas._libs import lib from pandas._libs.tslibs import Day +from pandas.errors import Pandas4Warning import pandas as pd from pandas import ( @@ -440,13 +441,16 @@ def cases(request): def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, request): - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", ""]]) # "date" is an index and a column, so get included in the agg if "df_mult" in request.node.callspec.id: date_mean = 
cases["date"].mean() date_std = cases["date"].std() - expected = pd.concat([date_mean, date_std, expected], axis=1) + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([date_mean, date_std, expected], axis=1) expected.columns = pd.MultiIndex.from_product( [["date", "A", "B"], ["mean", ""]] ) @@ -463,13 +467,17 @@ def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, reque ], ) def test_agg_both_mean_std_named_result(cases, a_mean, b_std, agg): - expected = pd.concat([a_mean, b_std], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([a_mean, b_std], axis=1) result = cases.aggregate(**agg) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_both_mean_std_dict_of_list(cases, a_mean, a_std): - expected = pd.concat([a_mean, a_std], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")]) result = cases.aggregate({"A": ["mean", "std"]}) tm.assert_frame_equal(result, expected) @@ -479,7 +487,9 @@ def test_agg_both_mean_std_dict_of_list(cases, a_mean, a_std): "agg", [{"func": ["mean", "sum"]}, {"mean": "mean", "sum": "sum"}] ) def test_agg_both_mean_sum(cases, a_mean, a_sum, agg): - expected = pd.concat([a_mean, a_sum], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ["mean", "sum"] result = cases["A"].aggregate(**agg) tm.assert_frame_equal(result, expected) @@ -502,7 +512,9 @@ def test_agg_dict_of_dict_specificationerror(cases, agg): def test_agg_dict_of_lists(cases, a_mean, a_std, b_mean, b_std): - 
expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")] ) @@ -556,7 +568,9 @@ def test_agg_no_column(cases, agg): def test_agg_specificationerror_nested(cases, cols, agg, a_sum, a_std, b_mean, b_std): # agg with different hows # equivalent of using a selection list / or not - expected = pd.concat([a_sum, a_std, b_mean, b_std], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + expected = pd.concat([a_sum, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_tuples( [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] ) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 2d0eb5d14a1d9..d4ad0e3b075e9 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,7 +10,10 @@ import numpy as np import pytest -from pandas.errors import InvalidIndexError +from pandas.errors import ( + InvalidIndexError, + Pandas4Warning, +) import pandas as pd from pandas import ( @@ -433,8 +436,11 @@ def test_concat_bug_1719(self): # to join with union # these two are of different length! 
- left = concat([ts1, ts2], join="outer", axis=1) - right = concat([ts2, ts1], join="outer", axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + left = concat([ts1, ts2], join="outer", axis=1) + with tm.assert_produces_warning(Pandas4Warning, match=msg): + right = concat([ts2, ts1], join="outer", axis=1) assert len(left) == len(right) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 0cf3192ea3a74..12ef453f46b60 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import Pandas4Warning + import pandas as pd from pandas import ( DataFrame, @@ -50,7 +52,9 @@ def test_concat_datetime_timezone(self): idx2 = date_range(start=idx1[0], end=idx1[-1], freq="h") df1 = DataFrame({"a": [1, 2, 3]}, index=idx1) df2 = DataFrame({"b": [1, 2, 3]}, index=idx2) - result = concat([df1, df2], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat([df1, df2], axis=1) exp_idx = DatetimeIndex( [ @@ -69,7 +73,9 @@ def test_concat_datetime_timezone(self): idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo") df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) - result = concat([df1, df3], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat([df1, df3], axis=1) exp_idx = DatetimeIndex( [ diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index 3523340bb2858..85aa243000434 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from 
pandas.errors import Pandas4Warning + from pandas import ( DataFrame, DatetimeIndex, @@ -54,11 +56,15 @@ def test_concat_series_axis1(self): pieces = [ts[:-2], ts[2:], ts[2:-2]] - result = concat(pieces, axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat(pieces, axis=1) expected = DataFrame(pieces).T tm.assert_frame_equal(result, expected) - result = concat(pieces, keys=["A", "B", "C"], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat(pieces, keys=["A", "B", "C"], axis=1) expected = DataFrame(pieces, index=["A", "B", "C"]).T tm.assert_frame_equal(result, expected)