Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: __from_arrow__ doesn't accept pyarrow null arrays for numeric ma… #52223

Merged
merged 6 commits into from Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Expand Up @@ -1208,6 +1208,7 @@ Numeric
- Bug in :meth:`~arrays.ArrowExtensionArray.mode` where ``dropna=False`` was not respected when there were ``NA`` values (:issue:`50982`)
- Bug in :meth:`DataFrame.query` where ``engine="numexpr"`` with column names ``min`` or ``max`` would raise a ``TypeError`` (:issue:`50937`)
- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with tz-aware data containing ``pd.NaT`` and ``axis=1`` would return incorrect results (:issue:`51242`)
- Bug where the ``__from_arrow__`` method of masked ExtensionDtypes (e.g. :class:`Float64Dtype`, :class:`BooleanDtype`) would not accept pyarrow arrays of type ``pyarrow.null()`` (:issue:`52223`)

Conversion
^^^^^^^^^^
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/arrays/arrow/_arrow_utils.py
Expand Up @@ -42,6 +42,11 @@ def pyarrow_array_to_numpy_and_mask(
"""
dtype = np.dtype(dtype)

if pyarrow.types.is_null(arr.type):
# No initialization of data is needed since everything is null
data = np.empty(len(arr), dtype=dtype)
mask = np.zeros(len(arr), dtype=bool)
return data, mask
buflist = arr.buffers()
# Since Arrow buffers might contain padding and the data might be offset,
# the buffer gets sliced here before handing it to numpy.
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/arrays/boolean.py
Expand Up @@ -108,7 +108,7 @@ def __from_arrow__(
"""
import pyarrow

if array.type != pyarrow.bool_():
if array.type != pyarrow.bool_() and not pyarrow.types.is_null(array.type):
raise TypeError(f"Expected array of boolean type, got {array.type} instead")

if isinstance(array, pyarrow.Array):
Expand All @@ -119,6 +119,13 @@ def __from_arrow__(

results = []
for arr in chunks:
if pyarrow.types.is_null(arr.type):
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
mask = np.ones(len(arr), dtype=bool)
# No need to init data, since all null
data = np.empty(len(arr), dtype=bool)
bool_arr = BooleanArray(data, mask)
results.append(bool_arr)
continue
buflist = arr.buffers()
data = pyarrow.BooleanArray.from_buffers(
arr.type, len(arr), [None, buflist[1]], offset=arr.offset
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/arrays/numeric.py
Expand Up @@ -76,7 +76,9 @@ def __from_arrow__(
array_class = self.construct_array_type()

pyarrow_type = pyarrow.from_numpy_dtype(self.type)
if not array.type.equals(pyarrow_type):
if not array.type.equals(pyarrow_type) and not pyarrow.types.is_null(
array.type
):
# test_from_arrow_type_error raise for string, but allow
# through itemsize conversion GH#31896
rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/arrays/masked/test_arrow_compat.py
Expand Up @@ -184,6 +184,13 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays):
tm.assert_numpy_array_equal(mask, mask_expected_empty)


def test_from_arrow_null(data):
lithomas1 marked this conversation as resolved.
Show resolved Hide resolved
arr = pa.nulls(10)
res = data.dtype.__from_arrow__(arr)
assert res.isna().all()
assert len(res) == 10


def test_from_arrow_type_error(data):
# ensure that __from_arrow__ returns a TypeError when getting a wrong
# array type
Expand Down