diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 6a85bfd852e195..659981d59e4a75 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -231,6 +231,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`) - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`) - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`) +- Bug in :class:`DataFrame` constructor unnecessarily copying 2D object arrays (:issue:`39263`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 57a87e1e283d99..8db101183e60cc 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -37,7 +37,6 @@ is_integer_dtype, is_list_like, is_named_tuple, - is_object_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -59,7 +58,7 @@ ) from pandas.core.internals.managers import ( create_block_manager_from_arrays, - create_block_manager_from_blocks, + create_block_manager_from_array ) if TYPE_CHECKING: @@ -232,34 +231,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): ) values = values.T - # if we don't have a dtype specified, then try to convert objects - # on the entire block; this is to convert if we have datetimelike's - # embedded in an object type - if dtype is None and is_object_dtype(values.dtype): - - if values.ndim == 2 and values.shape[0] != 1: - # transpose and separate blocks - - dvals_list = [maybe_infer_to_datetimelike(row) for row in values] - for n in range(len(dvals_list)): - if isinstance(dvals_list[n], np.ndarray): - dvals_list[n] = dvals_list[n].reshape(1, -1) - - from 
pandas.core.internals.blocks import make_block - - # TODO: What about re-joining object columns? - block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) - for n in range(len(dvals_list)) - ] - - else: - datelike_vals = maybe_infer_to_datetimelike(values) - block_values = [datelike_vals] - else: - block_values = [values] - - return create_block_manager_from_blocks(block_values, [columns, index]) + return create_block_manager_from_array(values, [columns, index], dtype) def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ad9cdcfa1b07f7..c3b8e66865eadf 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -28,6 +28,7 @@ find_common_type, infer_dtype_from_scalar, maybe_promote, + maybe_infer_to_datetimelike ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -1679,6 +1680,30 @@ def create_block_manager_from_arrays( raise construction_error(len(arrays), arrays[0].shape, axes, e) +def create_block_manager_from_array( + array, axes: List[Index], dtype: Optional[Dtype] = None +) -> BlockManager: + assert isinstance(axes, list) + assert all(isinstance(x, Index) for x in axes) + + # ensure we don't have any PandasArrays when we call get_block_type + # Note: just calling extract_array breaks tests that patch PandasArray._typ. 
+ array = array if not isinstance(array, ABCPandasArray) else array.to_numpy() + + maybe_datetime = [maybe_infer_to_datetimelike(instance) for instance in array] + try: + if dtype is not None or all(is_dtype_equal(instance.dtype, array.dtype) + for instance in maybe_datetime): + blocks = [make_block(array, slice(0, len(axes[0])), dtype=dtype)] + else: + blocks = _form_blocks(maybe_datetime, axes[0], axes) + mgr = BlockManager(blocks, axes) + mgr._consolidate_inplace() + return mgr + except ValueError as e: + raise construction_error(array.shape[0], array.shape[1:], axes, e) + + def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ passed = tuple(map(int, [tot_items] + list(block_shape))) @@ -1706,6 +1731,11 @@ def construction_error(tot_items, block_shape, axes, e=None): def _form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? + + if len(arrays) != len(names): + raise ValueError(f"Number of arrays ({len(arrays)}) " + f"does not match index length ({len(names)})") + items_dict: DefaultDict[str, List] = defaultdict(list) extra_locs = [] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6a0f86e133752a..4c1859f1d42347 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2267,6 +2267,14 @@ def test_nested_dict_construction(self): ) tm.assert_frame_equal(result, expected) + def test_object_array_does_not_copy(self): + a = np.array(["a", "b"], dtype="object") + b = np.array([["a", "b"], ["c", "d"]], dtype="object") + df = DataFrame(a) + assert np.shares_memory(df.values, a) + df2 = DataFrame(b) + assert np.shares_memory(df2.values, b) + def test_from_tzaware_object_array(self): # GH#26825 2D object array of tzaware timestamps should not raise dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")