Skip to content

Commit

Permalink
Backport PR #52422 on branch 2.0.x (BUG: merge with arrow and numpy d…
Browse files Browse the repository at this point in the history
…types raises) (#52480)

Backport PR #52422: BUG: merge with arrow and numpy dtypes raises

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Apr 6, 2023
1 parent b9b06b3 commit 33670f5
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Fixed regressions

Bug fixes
~~~~~~~~~
-
- Fixed bug in :func:`merge` when merging with ``ArrowDtype`` on one side and a NumPy dtype on the other side (:issue:`52406`)

.. ---------------------------------------------------------------------------
.. _whatsnew_201.other:
Expand Down
20 changes: 19 additions & 1 deletion pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,15 @@
)

from pandas import (
ArrowDtype,
Categorical,
Index,
MultiIndex,
Series,
)
import pandas.core.algorithms as algos
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
ExtensionArray,
)
Expand Down Expand Up @@ -2372,7 +2374,11 @@ def _factorize_keys(
rk = ensure_int64(rk.codes)

elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
if not isinstance(lk, BaseMaskedArray):
if not isinstance(lk, BaseMaskedArray) and not (
# exclude arrow dtypes that would get cast to object
isinstance(lk.dtype, ArrowDtype)
and is_numeric_dtype(lk.dtype.numpy_dtype)
):
lk, _ = lk._values_for_factorize()

# error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
Expand All @@ -2387,6 +2393,16 @@ def _factorize_keys(
assert isinstance(rk, BaseMaskedArray)
llab = rizer.factorize(lk._data, mask=lk._mask)
rlab = rizer.factorize(rk._data, mask=rk._mask)
elif isinstance(lk, ArrowExtensionArray):
assert isinstance(rk, ArrowExtensionArray)
# we can only get here with numeric dtypes
# TODO: Remove when we have a Factorizer for Arrow
llab = rizer.factorize(
lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
)
rlab = rizer.factorize(
rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
)
else:
# Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
# "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
Expand Down Expand Up @@ -2445,6 +2461,8 @@ def _convert_arrays_and_get_rizer_klass(
# Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
# expected type "Type[object]"
klass = _factorizers[lk.dtype.type] # type: ignore[index]
elif isinstance(lk.dtype, ArrowDtype):
klass = _factorizers[lk.dtype.numpy_dtype.type]
else:
klass = _factorizers[lk.dtype.type]

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2735,3 +2735,18 @@ def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type):
}
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"])
def test_merge_arrow_and_numpy_dtypes(dtype):
    """
    Merge a frame of the parametrized dtype with a pyarrow-backed int64 frame.

    The merge is performed in both directions; each result is compared
    against a copy of the frame that ``merge`` was called on.
    """
    # GH#52406
    pytest.importorskip("pyarrow")
    left = DataFrame({"a": [1, 2]}, dtype=dtype)
    right = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]")

    # Same order as before: parametrized frame as caller first, then the
    # pyarrow-backed frame as caller.
    for caller, other in ((left, right), (right, left)):
        result = caller.merge(other)
        expected = caller.copy()
        tm.assert_frame_equal(result, expected)

0 comments on commit 33670f5

Please sign in to comment.