From ed78f3dcc2a0c1f5a563598b2346290a595a0ed9 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Nov 2025 13:35:39 -0400 Subject: [PATCH 1/5] BUG: Fix sorting behavior of DataFrame.join on list arguments --- pandas/core/frame.py | 10 ++++++++-- pandas/tests/frame/methods/test_join.py | 9 +-------- pandas/tests/reshape/merge/test_join.py | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f62b5c07a5cf..01a869b23d688 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11389,7 +11389,7 @@ def join( # "Iterable[Union[DataFrame, Series]]" due to the if statements frames = [cast("DataFrame | Series", self)] + list(other) - can_concat = all(df.index.is_unique for df in frames) + can_concat = (how != "right") and all(df.index.is_unique for df in frames) # join indexes only using concat if can_concat: @@ -11397,8 +11397,13 @@ def join( res = concat( frames, axis=1, join="outer", verify_integrity=True, sort=sort ) - return res.reindex(self.index) + result = res.reindex(self.index) + if sort: + result = result.sort_index() + return result else: + if how == "outer": + sort = True return concat( frames, axis=1, join=how, verify_integrity=True, sort=sort ) @@ -11409,6 +11414,7 @@ def join( joined = merge( joined, frame, + sort=sort, how=how, left_index=True, right_index=True, diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index aaa9485cab580..c2fb20ee4d0d0 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -396,11 +396,8 @@ def test_join_list_series(float_frame): def test_suppress_future_warning_with_sort_kw(sort): - sort_kw = sort a = DataFrame({"col1": [1, 2]}, index=["c", "a"]) - b = DataFrame({"col2": [4, 5]}, index=["b", "a"]) - c = DataFrame({"col3": [7, 8]}, index=["a", "b"]) expected = DataFrame( @@ -410,11 +407,7 @@ def test_suppress_future_warning_with_sort_kw(sort): "col3": {"a": 7.0, "b": 8.0, "c": float("nan")}, } ) - if sort_kw is False: - expected = expected.reindex(index=["c", "a", "b"]) - - with tm.assert_produces_warning(None): - result = a.join([b, c], how="outer", sort=sort_kw) + result = a.join([b, c], how="outer", sort=sort) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 65bfea0b9beea..9767dd88ad005 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -671,6 +671,20 @@ def _check_diff_index(df_list, result, exp_index): with pytest.raises(ValueError, match=msg): df_list[0].join(df_list[1:], on="a") + @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) + def test_join_many_sort(self, how, sort): + df = DataFrame({"a": [1, 2, 3]}, index=[1, 0, 2]) + df2 = DataFrame({"b": [4, 5, 6]}, index=[2, 0, 1]) + if how == "right": + expected = DataFrame({"a": [3, 2, 1], "b": [4, 5, 6]}, index=[2, 0, 1]) + else: + expected = DataFrame({"a": [1, 2, 3], "b": [6, 5, 4]}, index=[1, 0, 2]) + if how == "outer" or sort: + # outer always sorts. + expected = expected.sort_index() + result = df.join([df2], how=how, sort=sort) + tm.assert_frame_equal(result, expected) + def test_join_many_mixed(self): df = DataFrame( np.random.default_rng(2).standard_normal((8, 4)), From d9541670e2ff5f3d9ecaff502874d4703d328589 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Nov 2025 13:54:04 -0400 Subject: [PATCH 2/5] Improve implementation & add another test --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 9 ++++---- pandas/tests/reshape/merge/test_join.py | 29 ++++++++++++++++++++++++- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 12f522301e121..74f73486ef452 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1183,6 +1183,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +- Bug in :func:`DataFrame.join` not producing the correct row order when joining with a list of Series/DataFrames (:issue:`62954) - Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`) - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 01a869b23d688..c9c588059f788 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -11389,17 +11389,18 @@ def join( # "Iterable[Union[DataFrame, Series]]" due to the if statements frames = [cast("DataFrame | Series", self)] + list(other) - can_concat = (how != "right") and all(df.index.is_unique for df in frames) + can_concat = all(df.index.is_unique for df in frames) # join indexes only using concat if can_concat: - if how == "left": + if how == "left" or how == "right": res = concat( frames, axis=1, join="outer", verify_integrity=True, sort=sort ) - result = res.reindex(self.index) + index = self.index if how == "left" else frames[-1].index if sort: - result = result.sort_index() + index = index.sort_values() + result = res.reindex(index) return result else: if how == "outer": diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 9767dd88ad005..5692cd6e267be 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -672,7 +672,7 @@ def _check_diff_index(df_list, result, exp_index): df_list[0].join(df_list[1:], on="a") @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) - def test_join_many_sort(self, how, sort): + def test_join_many_sort_unique(self, how, sort): df = DataFrame({"a": [1, 2, 3]}, index=[1, 0, 2]) df2 = DataFrame({"b": [4, 5, 6]}, index=[2, 0, 1]) if how == "right": @@ -685,6 +685,33 @@ def test_join_many_sort(self, how, sort): result = df.join([df2], how=how, sort=sort) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) + def test_join_many_sort_nonunique(self, how, sort): + df = DataFrame({"a": [1, 2, 3]}, index=[3, 0, 0]) + df2 = DataFrame({"b": [4, 5, 6]}, index=[2, 0, 1]) + if how == "inner": + expected = DataFrame({"a": [2, 3], "b": [5, 5]}, index=[0, 0]) + elif how == "left": + expected = DataFrame( + {"a": [1, 2, 3], "b": [np.nan, 5.0, 5.0]}, index=[3, 0, 0] + ) + elif how == "right": + expected = DataFrame( + {"a": [np.nan, 2.0, 3.0, np.nan], "b": [4, 5, 5, 6]}, index=[2, 0, 0, 1] + ) + else: + expected = DataFrame( + { + "a": [2.0, 3.0, np.nan, np.nan, 1.0], + "b": [5.0, 5.0, 6.0, 4.0, np.nan], + }, + index=[0, 0, 1, 2, 3], + ) + if sort: + expected = expected.sort_index() + result = df.join([df2], how=how, sort=sort) + tm.assert_frame_equal(result, expected) + def test_join_many_mixed(self): df = DataFrame( np.random.default_rng(2).standard_normal((8, 4)), From 4a173fb1c7c0a9965c8f6ccc5c8cac483ea6fa71 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Nov 2025 13:55:18 -0400 Subject: [PATCH 3/5] Remove test --- pandas/tests/frame/methods/test_join.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index c2fb20ee4d0d0..b2579bbcc0e44 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -395,22 +395,6 @@ def test_join_list_series(float_frame): tm.assert_frame_equal(result, float_frame) -def test_suppress_future_warning_with_sort_kw(sort): - a = DataFrame({"col1": [1, 2]}, index=["c", "a"]) - b = DataFrame({"col2": [4, 5]}, index=["b", "a"]) - c = DataFrame({"col3": [7, 8]}, index=["a", "b"]) - - expected = DataFrame( - { - "col1": {"a": 2.0, "b": float("nan"), "c": 1.0}, - "col2": {"a": 5.0, "b": 4.0, "c": float("nan")}, - "col3": {"a": 7.0, "b": 8.0, "c": float("nan")}, - } - ) - result = a.join([b, c], how="outer", sort=sort) - tm.assert_frame_equal(result, expected) - - class TestDataFrameJoin: def test_join(self, multiindex_dataframe_random_data): frame = multiindex_dataframe_random_data From 187eaad117b11153455f1ec79d4bb139551e0f0c Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Nov 2025 13:55:53 -0400 Subject: [PATCH 4/5] GH# --- pandas/tests/reshape/merge/test_join.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5692cd6e267be..31df52645f3f9 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -673,6 +673,7 @@ def _check_diff_index(df_list, result, exp_index): @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) def test_join_many_sort_unique(self, how, sort): + # https://github.com/pandas-dev/pandas/pull/62954 df = DataFrame({"a": [1, 2, 3]}, index=[1, 0, 2]) df2 = DataFrame({"b": [4, 5, 6]}, index=[2, 0, 1]) if how == "right": @@ -687,6 +688,7 @@ def test_join_many_sort_unique(self, how, sort): @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) def test_join_many_sort_nonunique(self, how, sort): + # https://github.com/pandas-dev/pandas/pull/62954 df = DataFrame({"a": [1, 2, 3]}, index=[3, 0, 0]) df2 = DataFrame({"b": [4, 5, 6]}, index=[2, 0, 1]) if how == "inner": From 317fb9acc4bc13b168181375d35fd2b836ca3a55 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 1 Nov 2025 14:24:33 -0400 Subject: [PATCH 5/5] fixup --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 74f73486ef452..39e5f4af57ec1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1183,7 +1183,6 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- Bug in :func:`DataFrame.join` not producing the correct row order when joining with a list of Series/DataFrames (:issue:`62954) - Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`) - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`) @@ -1191,6 +1190,7 @@ Reshaping - Bug in :meth:`DataFrame.combine` with non-unique columns incorrectly raising (:issue:`51340`) - Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) +- Bug in :meth:`DataFrame.join` not producing the correct row order when joining with a list of Series/DataFrames (:issue:`62954`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`Series.combine_first` incorrectly replacing ``None`` entries with ``NaN`` (:issue:`58977`)