Skip to content

Commit

Permalink
Backport PR #47762 on branch 1.4.x (REGR: preserve reindexed array ob…
Browse files Browse the repository at this point in the history
…ject (instead of creating new array) for concat with all-NA array) (#48308)

Backport PR #47762: REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
meeseeksmachine and jorisvandenbossche committed Aug 30, 2022
1 parent c40c48c commit bc82815
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 10 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.4.rst
Expand Up @@ -17,6 +17,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.fillna` not working on a :class:`DataFrame` with a :class:`MultiIndex` (:issue:`47649`)
- Fixed regression in taking NULL ``object`` values from a :class:`DataFrame` causing a segmentation violation. These NULL values are created by :meth:`numpy.empty_like` (:issue:`46848`)
- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`)
- Fixed regression in calling bitwise numpy ufuncs (for example, ``np.bitwise_and``) on Index objects (:issue:`46769`)
- Fixed regression in :func:`cut` using a ``datetime64`` IntervalIndex as bins (:issue:`46218`)
- Fixed regression in :meth:`DataFrame.select_dtypes` where ``include="number"`` included :class:`BooleanDtype` (:issue:`46870`)
Expand Down
25 changes: 15 additions & 10 deletions pandas/core/internals/concat.py
Expand Up @@ -477,16 +477,21 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
return DatetimeArray(i8values, dtype=empty_dtype)

elif is_1d_only_ea_dtype(empty_dtype):
empty_dtype = cast(ExtensionDtype, empty_dtype)
cls = empty_dtype.construct_array_type()

missing_arr = cls._from_sequence([], dtype=empty_dtype)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
if is_dtype_equal(blk_dtype, empty_dtype) and self.indexers:
# avoid creating new empty array if we already have an array
# with correct dtype that can be reindexed
pass
else:
empty_dtype = cast(ExtensionDtype, empty_dtype)
cls = empty_dtype.construct_array_type()

missing_arr = cls._from_sequence([], dtype=empty_dtype)
ncols, nrows = self.shape
assert ncols == 1, ncols
empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
return missing_arr.take(
empty_arr, allow_fill=True, fill_value=fill_value
)
elif isinstance(empty_dtype, ExtensionDtype):
# TODO: no tests get here, a handful would if we disabled
# the dt64tz special-case above (which is faster)
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/extension/array_with_attr/__init__.py
@@ -0,0 +1,6 @@
# Public API of the ``array_with_attr`` test extension package:
# re-export the array class and its dtype from the implementation module.
from pandas.tests.extension.array_with_attr.array import (
    FloatAttrArray,
    FloatAttrDtype,
)

__all__ = ["FloatAttrArray", "FloatAttrDtype"]
84 changes: 84 additions & 0 deletions pandas/tests/extension/array_with_attr/array.py
@@ -0,0 +1,84 @@
"""
Test extension array that has custom attribute information (not stored on the dtype).
"""
from __future__ import annotations

import numbers

import numpy as np

from pandas._typing import type_t

from pandas.core.dtypes.base import ExtensionDtype

import pandas as pd
from pandas.core.arrays import ExtensionArray


class FloatAttrDtype(ExtensionDtype):
    """
    Dtype for ``FloatAttrArray``: float64 scalars with NaN as the
    missing-value marker.
    """

    type = float
    name = "float_attr"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return FloatAttrArray


class FloatAttrArray(ExtensionArray):
    """
    Minimal float64-backed extension array carrying an extra ``attr``.

    The ``attr`` value is propagated through indexing, ``take``, ``copy``
    and concatenation so tests can check it survives internal reindexing.
    """

    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        # Only a float64 ndarray is accepted as backing storage.
        if not (isinstance(values, np.ndarray) and values.dtype == "float64"):
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        # ``dtype`` is ignored: storage is always float64 (attr starts as None).
        return cls(np.array(scalars, dtype="float64", copy=copy))

    def __getitem__(self, item):
        if isinstance(item, numbers.Integral):
            # scalar position -> plain float
            return self.data[item]
        # slice, list-like or boolean mask -> new array, keeping attr
        key = pd.api.indexers.check_array_indexer(self, item)
        return type(self)(self.data[key], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        # NaN is the only missing-value marker for this dtype.
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        from pandas.api.extensions import take

        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        taken = take(
            self.data, indexer, fill_value=fill_value, allow_fill=allow_fill
        )
        return type(self)(taken, self.attr)

    def copy(self):
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        joined = np.concatenate([arr.data for arr in to_concat])
        # attr of the first chunk wins (None for an empty concat)
        attr = to_concat[0].attr if len(to_concat) else None
        return cls(joined, attr)
33 changes: 33 additions & 0 deletions pandas/tests/extension/array_with_attr/test_array_with_attr.py
@@ -0,0 +1,33 @@
import numpy as np

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.array_with_attr import FloatAttrArray


def test_concat_with_all_na():
    # https://github.com/pandas-dev/pandas/pull/47762
    # ensure that attribute of the column array is preserved (when it gets
    # preserved in reindexing the array) during merge/concat
    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")

    # fully overlapping keys -> column passes through the merge unchanged
    left = pd.DataFrame({"col": arr, "key": [0, 1]})
    right = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})
    result = pd.merge(left, right, on="key")
    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    # partially overlapping keys -> column is reindexed via take
    left = pd.DataFrame({"col": arr, "key": [0, 1]})
    right = pd.DataFrame({"key": [0, 2], "col2": [1, 2]})
    result = pd.merge(left, right, on="key")
    expected = pd.DataFrame({"col": arr.take([0]), "key": [0], "col2": [1]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

    # outer concat along columns -> reindex introduces a missing row
    result = pd.concat([left.set_index("key"), right.set_index("key")], axis=1)
    expected = pd.DataFrame(
        {"col": arr.take([0, 1, -1]), "col2": [1, np.nan, 2], "key": [0, 1, 2]}
    ).set_index("key")
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"

0 comments on commit bc82815

Please sign in to comment.