Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: improve 2D array access / transpose() for masked dtypes #52083

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ def __from_arrow__(
return BooleanArray._concat_same_type(results)


_boolean_dtype = BooleanDtype()


def coerce_to_array(
values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
Expand Down Expand Up @@ -299,7 +302,7 @@ def __init__(
"values should be boolean numpy array. Use "
"the 'pd.array' function instead"
)
self._dtype = BooleanDtype()
self._dtype = _boolean_dtype
super().__init__(values, mask, copy=copy)

@property
Expand Down
41 changes: 36 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,10 @@
needs_i8_conversion,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
ExtensionDtype,
)
from pandas.core.dtypes.missing import (
isna,
notna,
Expand Down Expand Up @@ -2505,6 +2508,7 @@ def _from_arrays(
index,
dtype: Dtype | None = None,
verify_integrity: bool = True,
is_1d_ea_only: bool = False,
) -> Self:
"""
Create DataFrame from a list of arrays corresponding to the columns.
Expand Down Expand Up @@ -2544,6 +2548,7 @@ def _from_arrays(
dtype=dtype,
verify_integrity=verify_integrity,
typ=manager,
is_1d_ea_only=is_1d_ea_only,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

arrays_to_mgr already has a consolidate keyword. could we use that instead?

)
return cls(mgr)

Expand Down Expand Up @@ -3610,11 +3615,21 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:
# We have EAs with the same dtype. We can preserve that dtype in transpose.
dtype = dtypes[0]
arr_type = dtype.construct_array_type()
values = self.values
if isinstance(dtype, BaseMaskedDtype):
data, mask = self._mgr.as_array_masked()
new_values = [arr_type(data[i], mask[i]) for i in range(self.shape[0])]
else:
values = self.values
new_values = [
arr_type._from_sequence(row, dtype=dtype) for row in values
]
Comment on lines +3618 to +3625
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is the main change (bringing the large part of the speed-up for transpose)


new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
result = type(self)._from_arrays(
new_values, index=self.columns, columns=self.index
new_values,
index=self.columns,
columns=self.index,
verify_integrity=False,
is_1d_ea_only=True,
)

else:
Expand Down Expand Up @@ -10935,7 +10950,23 @@ def _get_data() -> DataFrame:
df = _get_data()
if axis is None:
return func(df.values)
elif axis == 1:

# if len(df._mgr) > 0:
# common_dtype = find_common_type(list(df._mgr.get_dtypes()))
# is_masked_ea = isinstance(common_dtype, BaseMaskedDtype)
# is_np = isinstance(common_dtype, np.dtype)
# else:
# common_dtype = None

# if axis == 1 and common_dtype and is_masked_ea:
# data, mask = self._mgr.as_array_masked()
# ea2d = common_dtype.construct_array_type()(data, mask)
# result = ea2d._reduce(name, axis=axis, skipna=skipna, **kwds)
# labels = self._get_agg_axis(axis)
# result = self._constructor_sliced(result, index=labels, copy=False)
# return result
Comment on lines +10954 to +10967
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rhshadrach this relates to #51923, and making use of the as_array_masked added in this PR could be a potential different way of tackling it.

Manually running one of the benchmarks you mentioned in #51955 (the bottom row of - 8.94±0s 24.3±0.3ms 0.00 stat_ops.FrameOps.time_op('mean', 'Int64', 1)), with the above uncommented, I get:

In [1]: values = np.random.randn(100000, 4)
   ...: df = pd.DataFrame(values.astype(int)).astype("Int64")

In [2]: %timeit df.mean(axis=1)
5.97 s ± 434 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)   # <-- main
1.74 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)   # <-- PR

Now, this is probably a bit too optimistic, because the as_array_masked is currently assuming all the same dtypes, but with the use of find_common_type here, it would need to handle some extra cases as well. But in general it should also give a way to speed-up this specific case (while using the actual masked reduction implementations, which already support 2D with axis=1)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now, this is probably a bit too optimistic, because the as_array_masked is currently assuming all the same dtypes, but with the use of find_common_type here, it would need to handle some extra cases as well.

As long as we're not doing inference, the transpose is guaranteed to be a single dtype. This is because two different dtypes in the transpose could only arise from two different dtypes in the rows of the original. So we can determine the common dtype, cast to a single dtype, and then transpose.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing a cast beforehand might indeed be the easiest, then as_array_masked doesn't need to handle the case of multiple dtypes itself.


if axis == 1:
if len(df.index) == 0:
# Taking a transpose would result in no columns, losing the dtype.
# In the empty case, reducing along axis 0 or 1 gives the same
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def arrays_to_mgr(
verify_integrity: bool = True,
typ: str | None = None,
consolidate: bool = True,
is_1d_ea_only: bool = False,
) -> Manager:
"""
Segregate Series based on type and coerce into matrices.
Expand All @@ -127,7 +128,8 @@ def arrays_to_mgr(

else:
index = ensure_index(index)
arrays = [extract_array(x, extract_numpy=True) for x in arrays]
if not is_1d_ea_only:
arrays = [extract_array(x, extract_numpy=True) for x in arrays]
# with _from_arrays, the passed arrays should never be Series objects
refs = [None] * len(arrays)

Expand All @@ -152,7 +154,7 @@ def arrays_to_mgr(

if typ == "block":
return create_block_manager_from_column_arrays(
arrays, axes, consolidate=consolidate, refs=refs
arrays, axes, consolidate, refs=refs, is_1d_ea_only=is_1d_ea_only
)
elif typ == "array":
return ArrayManager(arrays, [index, columns])
Expand Down
47 changes: 45 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,40 @@ def as_array(

return arr.transpose()

def as_array_masked(self) -> tuple[np.ndarray, np.ndarray]:
    """
    Convert the blockmanager data into a pair of 2D numpy arrays.

    All blocks are assumed to hold masked ExtensionArrays sharing a single
    dtype; the numpy dtype of the first block is used for the data array.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        ``(data, mask)``, both transposed relative to the manager's shape;
        ``mask`` has boolean dtype.

    Raises
    ------
    AssertionError
        If some manager rows are not covered by any block.
    """
    # TODO we already established we only have a single dtype, but this
    # could be generalized to be a mix of all masked dtypes
    # NOTE(review): assumes at least one block exists and every block's
    # values expose _data/_mask (BaseMaskedArray) -- confirm callers only
    # invoke this for all-masked-dtype frames.
    dtype = self.blocks[0].dtype.numpy_dtype

    result_data = np.empty(self.shape, dtype=dtype)
    result_mask = np.empty(self.shape, dtype="bool")

    # Track which manager rows have been filled by some block.
    itemmask = np.zeros(self.shape[0], dtype=bool)

    for blk in self.blocks:
        rl = blk.mgr_locs
        arr = blk.values
        result_data[rl.indexer] = arr._data
        result_mask[rl.indexer] = arr._mask
        itemmask[rl.indexer] = True

    if not itemmask.all():
        raise AssertionError("Some items were not contained in blocks")

    return result_data.transpose(), result_mask.transpose()

def _interleave(
self,
dtype: np.dtype | None = None,
Expand Down Expand Up @@ -2130,6 +2164,7 @@ def create_block_manager_from_column_arrays(
axes: list[Index],
consolidate: bool,
refs: list,
is_1d_ea_only: bool = False,
) -> BlockManager:
# Assertions disabled for performance (caller is responsible for verifying)
# assert isinstance(axes, list)
Expand All @@ -2143,7 +2178,7 @@ def create_block_manager_from_column_arrays(
# verify_integrity=False below.

try:
blocks = _form_blocks(arrays, consolidate, refs)
blocks = _form_blocks(arrays, consolidate, refs, is_1d_ea_only)
mgr = BlockManager(blocks, axes, verify_integrity=False)
except ValueError as e:
raise_construction_error(len(arrays), arrays[0].shape, axes, e)
Expand Down Expand Up @@ -2197,9 +2232,17 @@ def _grouping_func(tup: tuple[int, ArrayLike]) -> tuple[int, bool, DtypeObj]:
return sep, isinstance(dtype, np.dtype), dtype


def _form_blocks(arrays: list[ArrayLike], consolidate: bool, refs: list) -> list[Block]:
def _form_blocks(
arrays: list[ArrayLike], consolidate: bool, refs: list, is_1d_ea_only: bool
) -> list[Block]:
tuples = list(enumerate(arrays))

if is_1d_ea_only:
block_type = get_block_type(arrays[0].dtype)
return [
block_type(arr, placement=BlockPlacement(i), ndim=2) for i, arr in tuples
]

if not consolidate:
nbs = _tuples_to_blocks_no_consolidate(tuples, refs)
return nbs
Expand Down