From adeae840585a137528ba6d8d69ffb923c2f6080c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 28 Oct 2025 23:44:56 +0000 Subject: [PATCH 1/2] PERF: Avoid deep copies when casting dtypes in merge --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/merge.py | 8 ++++---- pandas/tests/copy_view/test_functions.py | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75b4c5c0fe14d..a6ce88dcc44e2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -937,6 +937,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :func:`merge` when join keys have different dtypes and need to be upcast (:issue:`60545`) - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 6516c843d637e..84cf81857cd52 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1186,8 +1186,8 @@ def _indicator_pre_merge( "Cannot use name of an existing column for indicator column" ) - left = left.copy() - right = right.copy() + left = left.copy(deep=False) + right = right.copy(deep=False) left["_left_indicator"] = 1 left["_left_indicator"] = left["_left_indicator"].astype("int8") @@ -1865,11 +1865,11 @@ def _maybe_coerce_merge_keys(self) -> None: # incompatible dtypes. See GH 16900. if name in self.left.columns: typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object - self.left = self.left.copy() + self.left = self.left.copy(deep=False) self.left[name] = self.left[name].astype(typ) if name in self.right.columns: typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object - self.right = self.right.copy() + self.right = self.right.copy(deep=False) self.right[name] = self.right[name].astype(typ) def _validate_left_right_on(self, left_on, right_on): diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index d23263835c615..7ee1e45d1fea4 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -243,6 +243,29 @@ def test_merge_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) +def test_merge_upcasting_no_copy(): + left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + left_copy = left.copy() + right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]}, dtype=object) + result = merge(left, right, on="a") + assert np.shares_memory(get_array(result, "b"), get_array(left, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(left, "a")) + tm.assert_frame_equal(left, left_copy) + + result = merge(right, left, on="a") + assert np.shares_memory(get_array(result, "b"), get_array(left, "b")) + assert not np.shares_memory(get_array(result, "a"), get_array(left, "a")) + tm.assert_frame_equal(left, left_copy) + + +def test_merge_indicator_no_deep_copy(): + left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]}) + result = merge(left, right, on="a", indicator=True) + assert np.shares_memory(get_array(result, "b"), get_array(left, "b")) + assert np.shares_memory(get_array(result, "c"), get_array(right, "c")) + + @pytest.mark.parametrize("dtype", [object, "str"]) def test_join_on_key(dtype): df_index = Index(["a", "b", "c"], name="key", dtype=dtype) From 739952c619c085750d65c1c5492aad8a3a411843 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 28 Oct 2025 23:45:44 +0000 Subject: [PATCH 2/2] Update v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a6ce88dcc44e2..06df616f0b0a2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -937,7 +937,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) -- Performance improvement in :func:`merge` when join keys have different dtypes and need to be upcast (:issue:`60545`) +- Performance improvement in :func:`merge` when join keys have different dtypes and need to be upcast (:issue:`62902`) - Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`DataFrame.__getitem__` when ``key`` is a :class:`DataFrame` with many columns (:issue:`61010`) - Performance improvement in :meth:`DataFrame.astype` when converting to extension floating dtypes, e.g. "Float64" (:issue:`60066`)