From a8cce286a82f88d4d06ae77450a837154d4ecd23 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 25 Oct 2025 13:03:32 +0100 Subject: [PATCH] Improve error reporting when merge validation fails --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/merge.py | 30 ++++++++++++++++++++---- pandas/tests/reshape/merge/test_merge.py | 20 ++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1d6fe2ae32071..75b4c5c0fe14d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -219,6 +219,7 @@ Other enhancements - Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) +- Improve error reporting by showing the first few duplicate keys when :func:`merge` validation fails (:issue:`62742`) - Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`) - Improved deprecation message for offset aliases (:issue:`60820`) - Many type aliases are now exposed in the new submodule :py:mod:`pandas.api.typing.aliases` (:issue:`55231`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 604181214ad44..6516c843d637e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1954,42 +1954,62 @@ def _validate_left_right_on(self, left_on, right_on): def _validate_validate_kwd(self, validate: str) -> None: # Check uniqueness of each if self.left_index: - left_unique = self.orig_left.index.is_unique + left_join_index = self.orig_left.index + left_unique = 
left_join_index.is_unique else: - left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique + left_join_index = MultiIndex.from_arrays(self.left_join_keys) + left_unique = left_join_index.is_unique if self.right_index: + right_join_index = self.orig_right.index right_unique = self.orig_right.index.is_unique else: - right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique + right_join_index = MultiIndex.from_arrays(self.right_join_keys) + right_unique = right_join_index.is_unique + + def left_error_msg(x: Index) -> str: + name = self.left_on if not self.left_index else lib.no_default + msg = x[x.duplicated()][:5].to_frame(name=name).to_string(index=False) + return f"\nDuplicates in left:\n {msg} ..." + + def right_error_msg(x: Index) -> str: + name = self.right_on if not self.right_index else lib.no_default + msg = x[x.duplicated()][:5].to_frame(name=name).to_string(index=False) + return f"\nDuplicates in right:\n {msg} ..." # Check data integrity if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: raise MergeError( "Merge keys are not unique in either left " - "or right dataset; not a one-to-one merge" + "or right dataset; not a one-to-one merge." 
+ f"{left_error_msg(left_join_index)}" + f"{right_error_msg(right_join_index)}" ) if not left_unique: raise MergeError( "Merge keys are not unique in left dataset; not a one-to-one merge" + f"{left_error_msg(left_join_index)}" ) if not right_unique: raise MergeError( "Merge keys are not unique in right dataset; not a one-to-one merge" + f"{right_error_msg(right_join_index)}" ) elif validate in ["one_to_many", "1:m"]: if not left_unique: raise MergeError( "Merge keys are not unique in left dataset; not a one-to-many merge" + f"{left_error_msg(left_join_index)}" ) elif validate in ["many_to_one", "m:1"]: if not right_unique: raise MergeError( "Merge keys are not unique in right dataset; " - "not a many-to-one merge" + "not a many-to-one merge\n" + f"{right_error_msg(right_join_index)}" ) elif validate in ["many_to_many", "m:m"]: diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index c38ee32cb7226..d188cb396a7da 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1325,6 +1325,26 @@ def test_validation(self): result = merge(left, right, on=["a", "b"], validate="1:1") tm.assert_frame_equal(result, expected_multi) + def test_merge_validate_error_message(self): + # GH#62742 + left = DataFrame({"key": [1, 1, 2]}) + right = DataFrame({"key": [1, 2, 2]}) + + with pytest.raises(MergeError, match="Duplicates in left:\n key\n 1 ...\n"): + merge(left, right, validate="1:1") + with pytest.raises(MergeError, match="Duplicates in left:\n key\n 1 ..."): + merge(left, right, validate="1:m") + with pytest.raises(MergeError, match="Duplicates in right:\n key\n 2 ..."): + merge(left, right, validate="1:1") + with pytest.raises(MergeError, match="Duplicates in right:\n key\n 2 ..."): + merge(left, right, validate="m:1") + + right = DataFrame({"key": [1, 2, 3]}) + with pytest.raises(MergeError, match="Duplicates in left:\n key\n 1 ..."): + merge(left, right, validate="1:1") + with 
pytest.raises(MergeError, match="Duplicates in right:\n key\n 1 ..."): + merge(right, left, validate="1:1") + def test_merge_two_empty_df_no_division_error(self): # GH17776, PR #17846 a = DataFrame({"a": [], "b": [], "c": []})