From 2be5c6e31279a38ad0d9ff9bb6f0c00d943a9ec5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 31 Oct 2025 10:59:40 -0700 Subject: [PATCH] BUG: DataFrame.combine_first with non-unique columns --- doc/source/whatsnew/v3.0.0.rst | 2 ++ pandas/core/frame.py | 26 ++++++++++++------- pandas/tests/frame/methods/test_combine.py | 16 ++++++++++++ .../tests/frame/methods/test_combine_first.py | 12 +++++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b1dc78bbf8020..1228d5b6ce2f4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1184,6 +1184,8 @@ Reshaping - Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`) - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`) +- Bug in :meth:`DataFrame.combine_first` with non-unique columns incorrectly raising (:issue:`29135`) +- Bug in :meth:`DataFrame.combine` with non-unique columns incorrectly raising (:issue:`51340`) - Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. 
(:issue:`58721`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6092368e2b944..5f62b5c07a5cf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9096,11 +9096,14 @@ def combine( # preserve column order new_columns = self.columns.union(other_columns, sort=False) + this = this.reindex(new_columns, axis=1) + other = other.reindex(new_columns, axis=1) + do_fill = fill_value is not None result = {} - for col in new_columns: - series = this[col] - other_series = other[col] + for i in range(this.shape[1]): + series = this.iloc[:, i] + other_series = other.iloc[:, i] this_dtype = series.dtype other_dtype = other_series.dtype @@ -9111,7 +9114,7 @@ def combine( # don't overwrite columns unnecessarily # DO propagate if this column is not in the intersection if not overwrite and other_mask.all(): - result[col] = this[col].copy() + result[i] = series.copy() continue if do_fill: @@ -9120,7 +9123,7 @@ def combine( series[this_mask] = fill_value other_series[other_mask] = fill_value - if col not in self.columns: + if new_columns[i] not in self.columns: # If self DataFrame does not have col in other DataFrame, # try to promote series, which is all NaN, as other_dtype. new_dtype = other_dtype @@ -9145,10 +9148,10 @@ def combine( arr, new_dtype ) - result[col] = arr + result[i] = arr - # convert_objects just in case - frame_result = self._constructor(result, index=new_index, columns=new_columns) + frame_result = self._constructor(result, index=new_index) + frame_result.columns = new_columns return frame_result.__finalize__(self, method="combine") def combine_first(self, other: DataFrame) -> DataFrame: @@ -9212,9 +9215,14 @@ def combiner(x: Series, y: Series): combined = self.combine(other, combiner, overwrite=False) dtypes = { + # Check for isinstance(..., (np.dtype, ExtensionDtype)) + # to prevent raising on non-unique columns; see GH#29135. + # Note that we simply do not cast in these cases. 
col: find_common_type([self.dtypes[col], other.dtypes[col]]) for col in self.columns.intersection(other.columns) - if combined.dtypes[col] != self.dtypes[col] + if isinstance(combined.dtypes[col], (np.dtype, ExtensionDtype)) + and isinstance(self.dtypes[col], (np.dtype, ExtensionDtype)) + and combined.dtypes[col] != self.dtypes[col] } if dtypes: diff --git a/pandas/tests/frame/methods/test_combine.py b/pandas/tests/frame/methods/test_combine.py index bc6a67e4e1f32..f7631f3a2adda 100644 --- a/pandas/tests/frame/methods/test_combine.py +++ b/pandas/tests/frame/methods/test_combine.py @@ -45,3 +45,19 @@ def test_combine_generic(self, float_frame): ) tm.assert_frame_equal(chunk, exp) tm.assert_frame_equal(chunk2, exp) + + def test_combine_nonunique_columns(self): + # GH#51340 + + df = pd.DataFrame({"A": range(5), "B": range(5)}) + df.columns = ["A", "A"] + + other = df.copy() + df.iloc[1, :] = None + + def combiner(a, b): + return b + + result = df.combine(other, combiner) + expected = other.astype("float64") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index e93684b4dc90f..e97bb2a98a390 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -413,6 +413,18 @@ def test_combine_first_preserve_EA_precision(self, wide_val, dtype): expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype) tm.assert_frame_equal(result, expected) + def test_combine_first_non_unique_columns(self): + # GH#29135 + df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"]) + df2 = DataFrame( + [[5, 6, 7], [8, 9, np.nan]], columns=["P", "Q", "Q"], index=["A", "B"] + ) + result = df1.combine_first(df2) + expected = DataFrame( + [[1, 6.0, 7.0], [3, 4.0, 4.0]], index=["A", "B"], columns=["P", "Q", "Q"] + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "scalar1, scalar2",