diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1e723493a4cc8..63902b53ea36d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -110,6 +110,30 @@ both XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`) For more, see :ref:`io.xml` in the user guide on IO tools. +.. _whatsnew_130.dataframe_honors_copy_with_dict: + +DataFrame constructor honors ``copy=False`` with dict +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When passing a dictionary to :class:`DataFrame` with ``copy=False``, +a copy will no longer be made (:issue:`32960`) + +.. ipython:: python + + arr = np.array([1, 2, 3]) + df = pd.DataFrame({"A": arr, "B": arr.copy()}, copy=False) + df + +``df["A"]`` remains a view on ``arr``: + +.. ipython:: python + + arr[0] = 0 + assert df.iloc[0, 0] == 0 + +The default behavior when not passing ``copy`` will remain unchanged, i.e. +a copy will be made. + .. _whatsnew_130.enhancements.other: Other enhancements @@ -546,6 +570,8 @@ Conversion - Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`) - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) - Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`) +- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) +- Strings ^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 510bdfcb0079f..6f2edaa300c93 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -476,8 +476,12 @@ class DataFrame(NDFrame, OpsMixin): RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool, default False - Copy data from inputs. Only affects DataFrame / 2d ndarray input. + copy : bool or None, default None + Copy data from inputs. + For dict data, the default of None behaves like ``copy=True``. For DataFrame + or 2d ndarray input, the default of None behaves like ``copy=False``. + + .. versionchanged:: 1.3.0 See Also -------- @@ -555,8 +559,16 @@ def __init__( index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, - copy: bool = False, + copy: Optional[bool] = None, ): + + if copy is None: + if isinstance(data, dict) or data is None: + # retain pre-GH#38939 default behavior + copy = True + else: + copy = False + if data is None: data = {} if dtype is not None: @@ -565,18 +577,13 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr - # first check if a Manager is passed without any other arguments - # -> use fastpath (without checking Manager type) - if ( - index is None - and columns is None - and dtype is None - and copy is False - and isinstance(data, (BlockManager, ArrayManager)) - ): - # GH#33357 fastpath - NDFrame.__init__(self, data) - return + if isinstance(data, (BlockManager, ArrayManager)): + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if index is None and columns is None and dtype is None and not copy: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return manager = get_option("mode.data_manager") @@ -586,7 +593,8 @@ def __init__( ) elif isinstance(data, dict): - mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager) + # GH#38939 de facto copy defaults to False only in non-dict cases + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1ee38834c5758..0ecd798986c53 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1807,7 +1807,9 @@ def describe(self, **kwargs): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T - return result.unstack() + # FIXME: not being consolidated breaks + # test_describe_with_duplicate_output_column_names + return result._consolidate().unstack() @final def resample(self, rule, *args, **kwargs): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 9959174373034..5b4b710838ef8 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -101,9 +101,11 @@ def arrays_to_mgr( arr_names, index, columns, + *, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, typ: Optional[str] = None, + consolidate: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. @@ -131,7 +133,9 @@ def arrays_to_mgr( axes = [columns, index] if typ == "block": - return create_block_manager_from_arrays(arrays, arr_names, axes) + return create_block_manager_from_arrays( + arrays, arr_names, axes, consolidate=consolidate + ) elif typ == "array": if len(columns) != len(arrays): assert len(arrays) == 0 @@ -181,7 +185,7 @@ def rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ) + mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype=dtype, typ=typ) if copy: mgr = mgr.copy() @@ -376,7 +380,13 @@ def maybe_squeeze_dt64tz(dta: ArrayLike) -> ArrayLike: def dict_to_mgr( - data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str + data: Dict, + index, + columns, + *, + dtype: Optional[DtypeObj] = None, + typ: str = "block", + copy: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. @@ -414,6 +424,8 @@ def dict_to_mgr( val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() + arrays = list(arrays) + else: keys = list(data.keys()) columns = data_names = Index(keys) @@ -424,7 +436,21 @@ def dict_to_mgr( arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ) + + if copy: + # arrays_to_mgr (via form_blocks) won't make copies for EAs + # dtype attr check to exclude EADtype-castable strs + arrays = [ + x + if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype) + else x.copy() + for x in arrays + ] + # TODO: can we get rid of the dt64tz special case above? + + return arrays_to_mgr( + arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy + ) def nested_data_to_arrays( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 69338abcd7d58..6681015856d6b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -53,7 +53,10 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype -from pandas.core.construction import extract_array +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import ( Float64Index, @@ -991,6 +994,8 @@ def fast_xs(self, loc: int) -> ArrayLike: # Any]]" result = np.empty(n, dtype=dtype) # type: ignore[arg-type] + result = ensure_wrapped_if_datetimelike(result) + for blk in self.blocks: # Such assignment may incorrectly coerce NaT to None # result[blk.mgr_locs] = blk._slice((slice(None), loc)) @@ -1693,7 +1698,7 @@ def set_values(self, values: ArrayLike): def create_block_manager_from_blocks( - blocks: List[Block], axes: List[Index] + blocks: List[Block], axes: List[Index], consolidate: bool = True ) -> BlockManager: try: mgr = BlockManager(blocks, axes) @@ -1703,7 +1708,8 @@ def create_block_manager_from_blocks( tot_items = sum(arr.shape[0] for arr in arrays) raise construction_error(tot_items, arrays[0].shape[1:], axes, err) - mgr._consolidate_inplace() + if consolidate: + mgr._consolidate_inplace() return mgr @@ -1713,7 +1719,10 @@ def _extract_array(obj): def create_block_manager_from_arrays( - arrays, names: Index, axes: List[Index] + arrays, + names: Index, + axes: List[Index], + consolidate: bool = True, ) -> BlockManager: assert isinstance(names, Index) assert isinstance(axes, list) @@ -1722,12 +1731,13 @@ def create_block_manager_from_arrays( arrays = [_extract_array(x) for x in arrays] try: - blocks = _form_blocks(arrays, names, axes) + blocks = _form_blocks(arrays, names, axes, consolidate) mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr except ValueError as e: raise construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr def construction_error( @@ -1760,7 +1770,7 @@ def construction_error( def _form_blocks( - arrays: List[ArrayLike], names: Index, axes: List[Index] + arrays: List[ArrayLike], names: Index, axes: List[Index], consolidate: bool ) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? @@ -1786,15 +1796,21 @@ def _form_blocks( blocks: List[Block] = [] if len(items_dict["NumericBlock"]): - numeric_blocks = _multi_blockify(items_dict["NumericBlock"]) + numeric_blocks = _multi_blockify( + items_dict["NumericBlock"], consolidate=consolidate + ) blocks.extend(numeric_blocks) if len(items_dict["TimeDeltaBlock"]): - timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"]) + timedelta_blocks = _multi_blockify( + items_dict["TimeDeltaBlock"], consolidate=consolidate + ) blocks.extend(timedelta_blocks) if len(items_dict["DatetimeBlock"]): - datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE) + datetime_blocks = _simple_blockify( + items_dict["DatetimeBlock"], DT64NS_DTYPE, consolidate=consolidate + ) blocks.extend(datetime_blocks) if len(items_dict["DatetimeTZBlock"]): @@ -1805,7 +1821,9 @@ def _form_blocks( blocks.extend(dttz_blocks) if len(items_dict["ObjectBlock"]) > 0: - object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_) + object_blocks = _simple_blockify( + items_dict["ObjectBlock"], np.object_, consolidate=consolidate + ) blocks.extend(object_blocks) if len(items_dict["CategoricalBlock"]) > 0: @@ -1844,11 +1862,14 @@ def _form_blocks( return blocks -def _simple_blockify(tuples, dtype) -> List[Block]: +def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]: """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """ + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype) + values, placement = _stack_arrays(tuples, dtype) # TODO: CHECK DTYPE? @@ -1859,8 +1880,12 @@ def _simple_blockify(tuples, dtype) -> List[Block]: return [block] -def _multi_blockify(tuples, dtype: Optional[Dtype] = None): +def _multi_blockify(tuples, dtype: Optional[DtypeObj] = None, consolidate: bool = True): """ return an array of blocks that potentially have different dtypes """ + + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype) + # group by dtype grouper = itertools.groupby(tuples, lambda x: x[1].dtype) @@ -1880,6 +1905,18 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None): return new_blocks +def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[Block]: + # tuples produced within _form_blocks are of the form (placement, whatever, array) + if dtype is not None: + return [ + new_block( + np.atleast_2d(x[1].astype(dtype, copy=False)), placement=x[0], ndim=2 + ) + for x in tuples + ] + return [new_block(np.atleast_2d(x[1]), placement=x[0], ndim=2) for x in tuples] + + def _stack_arrays(tuples, dtype: np.dtype): placement, arrays = zip(*tuples) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 1e2622d6a8fcd..ef86a8e6a1cb0 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -538,7 +538,6 @@ def test_df_div_zero_series_does_not_commute(self): def test_df_mod_zero_df(self, using_array_manager): # GH#3590, modulo as ints df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) - # this is technically wrong, as the integer portion is coerced to float first = Series([0, 0, 0, 0]) if not using_array_manager: @@ -551,6 +550,15 @@ def test_df_mod_zero_df(self, using_array_manager): result = df % df tm.assert_frame_equal(result, expected) + # GH#38939 If we dont pass copy=False, df is consolidated and + # result["first"] is float64 instead of int64 + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False) + first = Series([0, 0, 0, 0], dtype="int64") + second = Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + result = df % df + tm.assert_frame_equal(result, expected) + def test_df_mod_zero_array(self): # GH#3590, modulo as ints df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 366b24e328642..68dbdd9e0bf35 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -150,7 +150,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): return self._from_sequence(result) def copy(self): - return type(self)(self._data.copy()) + return type(self)(self._data.copy(), dtype=self.dtype) def astype(self, dtype, copy=True): if is_dtype_equal(dtype, self._dtype): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 437160e78741b..55f9d85574f94 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -261,7 +261,18 @@ def test_dataframe_constructor_with_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("frame", [True, False]) +@pytest.mark.parametrize( + "frame", + [ + pytest.param( + True, + marks=pytest.mark.xfail( + reason="pd.concat call inside NDFrame.astype reverts the dtype" + ), + ), + False, + ], +) def test_astype_dispatches(frame): # This is a dtype-specific test that ensures Series[decimal].astype # gets all the way through to ExtensionArray.astype diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 0613c727dec98..759277a47f62b 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -285,7 +285,7 @@ def test_combine_le(self, data_repeated): def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) - df = pd.DataFrame({"A": arr}) + df = pd.DataFrame({"A": arr}, copy=False) filled_val = df.iloc[0, 0] result = df.fillna(filled_val) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b76a44b3c86be..d618c4cda4f13 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1997,7 +1997,7 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({"A": series["A"]}) + df = DataFrame({"A": series["A"]}, copy=True) df["A"][:] = 5 assert not (series["A"] == 5).all() @@ -2311,6 +2311,86 @@ def test_constructor_list_str_na(self, string_dtype): expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("copy", [False, True]) + @td.skip_array_manager_not_yet_implemented + def test_dict_nocopy(self, copy, any_nullable_numeric_dtype, any_numpy_dtype): + a = np.array([1, 2], dtype=any_numpy_dtype) + b = np.array([3, 4], dtype=any_numpy_dtype) + if b.dtype.kind in ["S", "U"]: + # These get cast, making the checks below more cumbersome + return + + c = pd.array([1, 2], dtype=any_nullable_numeric_dtype) + df = DataFrame({"a": a, "b": b, "c": c}, copy=copy) + + def get_base(obj): + if isinstance(obj, np.ndarray): + return obj.base + elif isinstance(obj.dtype, np.dtype): + # i.e. DatetimeArray, TimedeltaArray + return obj._ndarray.base + else: + raise TypeError + + def check_views(): + # written to work for either BlockManager or ArrayManager + assert sum(x is c for x in df._mgr.arrays) == 1 + assert ( + sum( + get_base(x) is a + for x in df._mgr.arrays + if isinstance(x.dtype, np.dtype) + ) + == 1 + ) + assert ( + sum( + get_base(x) is b + for x in df._mgr.arrays + if isinstance(x.dtype, np.dtype) + ) + == 1 + ) + + if not copy: + # constructor preserves views + check_views() + + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + if not copy: + # Check that the underlying data behind df["c"] is still `c` + # after setting with iloc. Since we don't know which entry in + # df._mgr.arrays corresponds to df["c"], we just check that exactly + # one of these arrays is `c`. GH#38939 + assert sum(x is c for x in df._mgr.arrays) == 1 + # TODO: we can call check_views if we stop consolidating + # in setitem_with_indexer + + # FIXME: until GH#35417, iloc.setitem into EA values does not preserve + # view, so we have to check in the other direction + # df.iloc[0, 2] = 0 + # if not copy: + # check_views() + c[0] = 0 + + if copy: + if a.dtype.kind == "M": + assert a[0] == a.dtype.type(1, "ns") + assert b[0] == b.dtype.type(3, "ns") + else: + assert a[0] == a.dtype.type(1) + assert b[0] == b.dtype.type(3) + # FIXME: enable after GH#35417 + # assert c[0] == 1 + assert df.iloc[0, 2] == 1 + else: + # TODO: we can call check_views if we stop consolidating + # in setitem_with_indexer + # FIXME: enable after GH#35417 + # assert b[0] == 0 + assert df.iloc[0, 2] == 0 + class TestDataFrameConstructorWithDatetimeTZ: @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 85accac5a8235..ae07fc6e3b2b3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1063,6 +1063,7 @@ def test_loc_setitem_empty_append_raises(self): [ "cannot copy sequence with size 2 to array axis with dimension 0", r"could not broadcast input array from shape \(2,\) into shape \(0,\)", + "Must have equal len keys and value when setting with an iterable", ] ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0d41a89931e9..b8680cc4e611e 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -171,7 +171,8 @@ def test_partial_setting_mixed_dtype(self): tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) # columns will align - df = DataFrame(columns=["A", "B"]) + # TODO: it isn't great that this behavior depends on consolidation + df = DataFrame(columns=["A", "B"])._consolidate() df.loc[0] = Series(1, index=["B"]) exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64")