Skip to content

Commit

Permalink
Backport PR #52591 on branch 2.0.x (BUG: pd.array raising with NumPy …
Browse files Browse the repository at this point in the history
…array and large dtype) (#52951)

Backport PR #52591: BUG: pd.array raising with NumPy array and large dtype

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Apr 27, 2023
1 parent cab4cf4 commit 91757c5
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Bug fixes
~~~~~~~~~
- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
- Bug in :func:`array` raising when passed a ``NumPy`` array together with a ``pa.large_string`` or ``pa.large_binary`` dtype (:issue:`52590`)
-

.. ---------------------------------------------------------------------------
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,16 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
Construct a new ExtensionArray from a sequence of scalars.
"""
pa_dtype = to_pyarrow_type(dtype)
if (
isinstance(scalars, np.ndarray)
and isinstance(dtype, ArrowDtype)
and (
pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
)
):
# See https://github.com/apache/arrow/issues/35289
scalars = scalars.tolist()

if isinstance(scalars, cls):
scalars = scalars._data
elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2620,6 +2620,20 @@ def test_setitem_boolean_replace_with_mask_segfault():
assert arr._data == expected._data


@pytest.mark.parametrize(
    "data, arrow_dtype",
    [
        ([b"a", b"b"], pa.large_binary()),
        (["a", "b"], pa.large_string()),
    ],
)
def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype):
    """Check ``pd.array`` on a NumPy array with a large Arrow dtype.

    Constructing from ``np.array(data)`` with an ``ArrowDtype`` wrapping
    ``pa.large_binary()`` or ``pa.large_string()`` must produce the same
    extension array as constructing from the plain Python list.
    """
    large_dtype = ArrowDtype(arrow_dtype)
    numpy_values = np.array(data)

    from_ndarray = pd.array(numpy_values, dtype=large_dtype)
    from_list = pd.array(data, dtype=large_dtype)

    tm.assert_extension_array_equal(from_ndarray, from_list)


@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES)
def test_describe_numeric_data(pa_type):
# GH 52470
Expand Down

0 comments on commit 91757c5

Please sign in to comment.