From 5e4a066ace3397704429025f78aa8467551303b9 Mon Sep 17 00:00:00 2001 From: Vijay Sarathy Date: Sun, 19 Oct 2025 16:28:32 -0400 Subject: [PATCH 1/6] Fixed DataFrame.combine with non-unique columns --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/frame.py | 6 +++--- pandas/tests/frame/methods/test_combine.py | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 2708e69d01224..cfc10485f3dad 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1234,12 +1234,12 @@ Other - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) - Deprecated the keyword ``check_datetimelike_compat`` in :meth:`testing.assert_frame_equal` and :meth:`testing.assert_series_equal` (:issue:`55638`) +- Fixed bug in :meth:`DataFrame.combine` with non-unique columns (:issue:`51340`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when trying to replace :class:`NA` values in a :class:`Float64Dtype` object with ``np.nan``; this now works with ``pd.set_option("mode.nan_is_na", False)`` and is irrelevant otherwise (:issue:`55127`) - Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when trying to replace :class:`np.nan` values in a :class:`Int64Dtype` object with :class:`NA`; this is now a no-op with ``pd.set_option("mode.nan_is_na", False)`` and is irrelevant otherwise (:issue:`51237`) - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) - .. ***DO NOT USE THIS SECTION*** - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c41b82bbbc8e..f8d6366180a98 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9041,9 +9041,9 @@ def combine( new_columns = self.columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} - for col in new_columns: - series = this[col] - other_series = other[col] + for i, col in enumerate(new_columns): + series = this.iloc[:, i] + other_series = other.iloc[:, i] this_dtype = series.dtype other_dtype = other_series.dtype diff --git a/pandas/tests/frame/methods/test_combine.py b/pandas/tests/frame/methods/test_combine.py index bc6a67e4e1f32..abc8fda8fb88d 100644 --- a/pandas/tests/frame/methods/test_combine.py +++ b/pandas/tests/frame/methods/test_combine.py @@ -45,3 +45,17 @@ def test_combine_generic(self, float_frame): ) tm.assert_frame_equal(chunk, exp) tm.assert_frame_equal(chunk2, exp) + + def test_combine_non_unique_columns(self): + # GH#51340 + df = pd.DataFrame({"A": range(5), "B": range(5)}) + df.columns = ["A", "A"] + + other = df.copy() + df.iloc[1, :] = 11 + + def combiner(a, b): + return b + + result = df.combine(other, combiner) + tm.assert_frame_equal(result, other) From dcac1a3a21073471f9c926b44aa3ef529d0e3c02 Mon Sep 17 00:00:00 2001 From: Vijay Sarathy Date: Fri, 24 Oct 2025 17:51:03 -0400 Subject: [PATCH 2/6] Re-doing fix for Dataframe combine that works with duplicate column names. --- pandas/core/frame.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f8d6366180a98..bd846364f8b30 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9037,13 +9037,26 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() + def rename_duplicates(columns: Index) -> Index: + seen = {} + for col in columns: + if col in seen: + col = col + f".{seen[col]}" + seen[col] = seen.get(col, 0) + 1 + return columns + + new_columns_out = self.columns.union(other_columns, sort=False) + self_columns, other_columns = ( + rename_duplicates(self.columns), + rename_duplicates(other_columns), + ) # preserve column order - new_columns = self.columns.union(other_columns, sort=False) + new_columns_unique = self_columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} - for i, col in enumerate(new_columns): - series = this.iloc[:, i] - other_series = other.iloc[:, i] + for col in new_columns_unique: + series = this[col] + other_series = other[col] this_dtype = series.dtype other_dtype = other_series.dtype @@ -9091,7 +9104,9 @@ def combine( result[col] = arr # convert_objects just in case - frame_result = self._constructor(result, index=new_index, columns=new_columns) + frame_result = self._constructor( + result, index=new_index, columns=new_columns_out + ) return frame_result.__finalize__(self, method="combine") def combine_first(self, other: DataFrame) -> DataFrame: From 6507206d2eb6264b6b6166847c7c8843dfa74c14 Mon Sep 17 00:00:00 2001 From: Vijay Sarathy Date: Mon, 27 Oct 2025 16:53:55 -0400 Subject: [PATCH 3/6] Refactored to use dedup_names from pandas.io.common. --- pandas/core/frame.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bd846364f8b30..ddcfa8501a88b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -194,7 +194,10 @@ nargsort, ) -from pandas.io.common import get_handle +from pandas.io.common import ( + dedup_names, + get_handle, +) from pandas.io.formats import ( console, format as fmt, @@ -9026,7 +9029,7 @@ def combine( 2 NaN 3.0 1.0 """ other_idxlen = len(other.index) # save for compare - other_columns = other.columns + self_columns, other_columns = self.columns, other.columns this, other = self.align(other) new_index = this.index @@ -9037,19 +9040,17 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - def rename_duplicates(columns: Index) -> Index: - seen = {} - for col in columns: - if col in seen: - col = col + f".{seen[col]}" - seen[col] = seen.get(col, 0) + 1 - return columns - new_columns_out = self.columns.union(other_columns, sort=False) - self_columns, other_columns = ( - rename_duplicates(self.columns), - rename_duplicates(other_columns), + # Deduplicate column names if necessary + self_columns = Index(dedup_names(self_columns, False), dtype=self_columns.dtype) + other_columns = Index( + dedup_names(other_columns, False), dtype=other_columns.dtype ) + this.columns = Index(dedup_names(this.columns, False), dtype=this.columns.dtype) + other.columns = Index( + dedup_names(other.columns, False), dtype=other.columns.dtype + ) + # preserve column order new_columns_unique = self_columns.union(other_columns, sort=False) do_fill = fill_value is not None From 36ed5ecdffa8c1d3269d763f7160eaa961b364de Mon Sep 17 00:00:00 2001 From: Vijay Sarathy Date: Mon, 27 Oct 2025 19:15:22 -0400 Subject: [PATCH 4/6] Minor fixes to pass a couple build tests. --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ee4ff55adb805..f0e6fc3f896a5 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1266,6 +1266,7 @@ Other - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) + .. ***DO NOT USE THIS SECTION*** - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c75ce72ae596c..0c141c0c66f15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9109,13 +9109,17 @@ def combine( new_columns_out = self.columns.union(other_columns, sort=False) # Deduplicate column names if necessary - self_columns = Index(dedup_names(self_columns, False), dtype=self_columns.dtype) + self_columns = Index( + dedup_names(list(self_columns), False), dtype=self_columns.dtype + ) other_columns = Index( - dedup_names(other_columns, False), dtype=other_columns.dtype + dedup_names(list(other_columns), False), dtype=other_columns.dtype + ) + this.columns = Index( + dedup_names(list(this.columns), False), dtype=this.columns.dtype ) - this.columns = Index(dedup_names(this.columns, False), dtype=this.columns.dtype) other.columns = Index( - dedup_names(other.columns, False), dtype=other.columns.dtype + dedup_names(list(other.columns), False), dtype=other.columns.dtype ) # preserve column order From b553d6927edab885ec2a90996cdd51a1aee2d24d Mon Sep 17 00:00:00 2001 From: Vijay Sarathy Date: Fri, 31 Oct 2025 11:08:35 -0400 Subject: [PATCH 5/6] Remove dedup_names and use column indices instead of names in DataFrame.combine. --- pandas/core/frame.py | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0c141c0c66f15..82403aa55e534 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -195,7 +195,6 @@ ) from pandas.io.common import ( - dedup_names, get_handle, ) from pandas.io.formats import ( @@ -9107,28 +9106,13 @@ def combine( if self.empty and len(other) == other_idxlen: return other.copy() - new_columns_out = self.columns.union(other_columns, sort=False) - # Deduplicate column names if necessary - self_columns = Index( - dedup_names(list(self_columns), False), dtype=self_columns.dtype - ) - other_columns = Index( - dedup_names(list(other_columns), False), dtype=other_columns.dtype - ) - this.columns = Index( - dedup_names(list(this.columns), False), dtype=this.columns.dtype - ) - other.columns = Index( - dedup_names(list(other.columns), False), dtype=other.columns.dtype - ) - # preserve column order - new_columns_unique = self_columns.union(other_columns, sort=False) + new_columns = self_columns.union(other_columns, sort=False) do_fill = fill_value is not None result = {} - for col in new_columns_unique: - series = this[col] - other_series = other[col] + for i in range(this.shape[1]): + series = this.iloc[:, i] + other_series = other.iloc[:, i] this_dtype = series.dtype other_dtype = other_series.dtype @@ -9139,7 +9123,7 @@ def combine( # don't overwrite columns unnecessarily # DO propagate if this column is not in the intersection if not overwrite and other_mask.all(): - result[col] = this[col].copy() + result.iloc[:, i] = this.iloc[:, i].copy() continue if do_fill: @@ -9148,7 +9132,7 @@ def combine( series[this_mask] = fill_value other_series[other_mask] = fill_value - if col not in self.columns: + if other.columns[i] not in self.columns: # If self DataFrame does not have col in other DataFrame, # try to promote series, which is all NaN, as other_dtype. new_dtype = other_dtype @@ -9173,12 +9157,10 @@ def combine( arr, new_dtype ) - result[col] = arr + result[new_columns[i]] = arr # convert_objects just in case - frame_result = self._constructor( - result, index=new_index, columns=new_columns_out - ) + frame_result = self._constructor(result, index=new_index, columns=new_columns) return frame_result.__finalize__(self, method="combine") def combine_first(self, other: DataFrame) -> DataFrame: From 51a6455e638c72e21d48df2c58e99d0880a2ba1a Mon Sep 17 00:00:00 2001 From: Vijay Sarathy Date: Fri, 31 Oct 2025 12:16:53 -0400 Subject: [PATCH 6/6] Fix logic for column access in result dictionary in DataFrame.combine. --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 82403aa55e534..fcd4d6aaf1195 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9123,7 +9123,7 @@ def combine( # don't overwrite columns unnecessarily # DO propagate if this column is not in the intersection if not overwrite and other_mask.all(): - result.iloc[:, i] = this.iloc[:, i].copy() + result[this.columns[i]] = this.iloc[:, i].copy() continue if do_fill: @@ -9157,7 +9157,7 @@ def combine( arr, new_dtype ) - result[new_columns[i]] = arr + result[this.columns[i]] = arr # convert_objects just in case frame_result = self._constructor(result, index=new_index, columns=new_columns)