From cbc97f0e7ac49f5a78eb982f933281dc879a7e2a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Oct 2020 10:45:28 -0700 Subject: [PATCH 01/31] ENH: allow non-consolidation in constructors --- pandas/core/frame.py | 82 +++++++++++++++++++++++---- pandas/core/internals/construction.py | 42 +++++++++++--- pandas/core/internals/managers.py | 17 ++++-- pandas/core/ops/__init__.py | 2 +- 4 files changed, 118 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1f9987d9d3f5b..4af6097a51f73 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -361,6 +361,8 @@ class DataFrame(NDFrame): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. + consolidate : bool or None, default None + Whether to consolidate the arrays in the new DataFrame. See Also -------- @@ -437,12 +439,16 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, + consolidate=None, ): if data is None: data = {} if dtype is not None: dtype = self._validate_dtype(dtype) + if consolidate is None: + consolidate = not copy + if isinstance(data, DataFrame): data = data._mgr @@ -457,7 +463,7 @@ def __init__( ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, consolidate=consolidate) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords @@ -474,7 +480,14 @@ def __init__( data[mask] = fill_value else: data = data.copy() - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -482,11 +495,26 @@ def __init__( data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = init_dict(data, 
index, columns, dtype=dtype) + mgr = init_dict( + data, index, columns, dtype=dtype, consolidate=consolidate + ) elif getattr(data, "name", None) is not None: - mgr = init_dict({data.name: data}, index, columns, dtype=dtype) + mgr = init_dict( + {data.name: data}, + index, + columns, + dtype=dtype, + consolidate=consolidate, + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): @@ -510,11 +538,27 @@ def __init__( else: index = ibase.default_index(len(data)) - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr( + arrays, + columns, + index, + columns, + dtype=dtype, + consolidate=consolidate, + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) else: - mgr = init_dict({}, index, columns, dtype=dtype) + mgr = init_dict( + {}, index, columns, dtype=dtype, consolidate=consolidate + ) # For data is scalar else: if index is None or columns is None: @@ -530,7 +574,9 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + mgr = arrays_to_mgr( + values, columns, index, columns, dtype=None, consolidate=consolidate + ) else: # Attempt to coerce to a numpy array try: @@ -550,7 +596,12 @@ def __init__( ) mgr = init_ndarray( - values, index, columns, dtype=values.dtype, copy=False + values, + index, + columns, + dtype=values.dtype, + copy=False, + consolidate=consolidate, ) NDFrame.__init__(self, mgr) @@ -1665,6 +1716,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, + consolidate: bool = True, ) -> DataFrame: 
""" Convert structured or record ndarray to DataFrame. @@ -1692,6 +1744,8 @@ def from_records( decimal.Decimal) to floating point, useful for SQL result sets. nrows : int, default None Number of rows to read if data is an iterator. + consolidate: bool, default True + Whether to consolidate the arrays in the new DataFrame. Returns ------- @@ -1827,7 +1881,9 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + mgr = arrays_to_mgr( + arrays, arr_columns, result_index, columns, consolidate=consolidate + ) return cls(mgr) @@ -2006,6 +2062,7 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, + consolidate: bool = True, ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2026,6 +2083,8 @@ def _from_arrays( stored in a block (numpy ndarray or ExtensionArray), have the same length as and are aligned with the index, and that `columns` and `index` are ensured to be an Index object. + consolidate: bool, default True + Whether to consolidate the passed arrays in the new DataFrame. Returns ------- @@ -2041,6 +2100,7 @@ def _from_arrays( columns, dtype=dtype, verify_integrity=verify_integrity, + consolidate=consolidate, ) return cls(mgr) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6244f1bf0a2d2..618e06ad30d8a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -65,6 +65,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + consolidate: bool = True, ): """ Segregate Series based on type and coerce into matrices. 
@@ -91,7 +92,9 @@ def arrays_to_mgr( # from BlockManager perspective axes = [columns, index] - return create_block_manager_from_arrays(arrays, arr_names, axes) + return create_block_manager_from_arrays( + arrays, arr_names, axes, consolidate=consolidate + ) def masked_rec_array_to_mgr( @@ -130,7 +133,9 @@ def masked_rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + mgr = arrays_to_mgr( + arrays, arr_columns, index, columns, dtype, consolidate=True + ) # FIXME: dont hardcode if copy: mgr = mgr.copy() @@ -141,7 +146,14 @@ def masked_rec_array_to_mgr( # DataFrame Constructor Interface -def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): +def init_ndarray( + values, + index, + columns, + dtype: Optional[DtypeObj], + copy: bool, + consolidate: bool = True, +): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): @@ -170,7 +182,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) - return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + return arrays_to_mgr( + [values], columns, index, columns, dtype=dtype, consolidate=consolidate + ) elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 @@ -184,7 +198,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + return arrays_to_mgr( + values, columns, index, columns, dtype=dtype, consolidate=consolidate + ) # by definition an array here # the dtypes will be coerced to a single dtype @@ -233,10 +249,18 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): else: block_values = [values] - return create_block_manager_from_blocks(block_values, [columns, 
index]) + return create_block_manager_from_blocks( + block_values, [columns, index], consolidate=consolidate + ) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def init_dict( + data: Dict, + index, + columns, + dtype: Optional[DtypeObj] = None, + consolidate: bool = True, +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. @@ -282,7 +306,9 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr( + arrays, data_names, index, columns, dtype=dtype, consolidate=consolidate + ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f2480adce89b4..2a08e2dcbe136 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1595,7 +1595,9 @@ def fast_xs(self, loc): # Constructor Helpers -def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: +def create_block_manager_from_blocks( + blocks, axes: List[Index], consolidate: bool = True +) -> BlockManager: try: if len(blocks) == 1 and not isinstance(blocks[0], Block): # if blocks[0] is of length 0, return empty blocks @@ -1610,7 +1612,8 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: ] mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() + if consolidate: + mgr._consolidate_inplace() return mgr except ValueError as e: @@ -1620,7 +1623,10 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: def create_block_manager_from_arrays( - arrays, names: Index, axes: List[Index] + arrays, + names: Index, + axes: List[Index], + consolidate: bool = True, ) -> BlockManager: assert isinstance(names, Index) assert 
isinstance(axes, list) @@ -1629,10 +1635,11 @@ def create_block_manager_from_arrays( try: blocks = form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr except ValueError as e: raise construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr def construction_error(tot_items, block_shape, axes, e=None): diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 36e3a0e37c1ae..c8a968e61983f 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -272,7 +272,7 @@ def dispatch_to_series(left, right, func, axis: Optional[int] = None): raise NotImplementedError(right) return type(left)._from_arrays( - arrays, left.columns, left.index, verify_integrity=False + arrays, left.columns, left.index, verify_integrity=False, consolidate=False ) From 5c94129f69f33e40e9edfcd58e3f980aa42b6360 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Oct 2020 15:00:42 -0700 Subject: [PATCH 02/31] mypy fixup --- pandas/tests/arithmetic/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 6286711ac6113..c808644e007b0 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -221,7 +221,9 @@ def mismatched_freq(request): # ------------------------------------------------------------------ -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func) +@pytest.fixture( + params=[pd.Index, pd.Series, pd.DataFrame], ids=id_func # type: ignore[list-item] +) def box(request): """ Several array-like containers that should have effectively identical From d653c5435729c6bb4a9e6d51445d85932ec70e9a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Oct 2020 10:45:28 -0700 Subject: [PATCH 03/31] ENH: allow non-consolidation in constructors --- pandas/core/frame.py | 84 +++++++++++++++++++++++---- 
pandas/core/internals/construction.py | 42 +++++++++++--- pandas/core/internals/managers.py | 17 ++++-- 3 files changed, 118 insertions(+), 25 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9cc9c9ef200cd..8e2e4c77f5fc0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -421,6 +421,8 @@ class DataFrame(NDFrame, OpsMixin): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. + consolidate : bool or None, default None + Whether to consolidate the arrays in the new DataFrame. See Also -------- @@ -508,12 +510,16 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, + consolidate=None, ): if data is None: data = {} if dtype is not None: dtype = self._validate_dtype(dtype) + if consolidate is None: + consolidate = not copy + if isinstance(data, DataFrame): data = data._mgr @@ -528,7 +534,7 @@ def __init__( ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict(data, index, columns, dtype=dtype, consolidate=consolidate) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords @@ -545,7 +551,14 @@ def __init__( data[mask] = fill_value else: data = data.copy() - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -553,11 +566,26 @@ def __init__( data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = init_dict(data, index, columns, dtype=dtype) + mgr = init_dict( + data, index, columns, dtype=dtype, consolidate=consolidate + ) elif getattr(data, "name", None) is not None: - mgr = init_dict({data.name: data}, index, columns, dtype=dtype) + mgr = init_dict( + {data.name: data}, + index, + 
columns, + dtype=dtype, + consolidate=consolidate, + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) # For data is list-like, or Iterable (will consume into list) elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): @@ -581,11 +609,27 @@ def __init__( else: index = ibase.default_index(len(data)) - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) + mgr = arrays_to_mgr( + arrays, + columns, + index, + columns, + dtype=dtype, + consolidate=consolidate, + ) else: - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) + mgr = init_ndarray( + data, + index, + columns, + dtype=dtype, + copy=copy, + consolidate=consolidate, + ) else: - mgr = init_dict({}, index, columns, dtype=dtype) + mgr = init_dict( + {}, index, columns, dtype=dtype, consolidate=consolidate + ) # For data is scalar else: if index is None or columns is None: @@ -601,7 +645,9 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + mgr = arrays_to_mgr( + values, columns, index, columns, dtype=None, consolidate=consolidate + ) else: # Attempt to coerce to a numpy array try: @@ -621,7 +667,12 @@ def __init__( ) mgr = init_ndarray( - values, index, columns, dtype=values.dtype, copy=False + values, + index, + columns, + dtype=values.dtype, + copy=False, + consolidate=consolidate, ) NDFrame.__init__(self, mgr) @@ -1733,6 +1784,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, + consolidate: bool = True, ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. @@ -1760,6 +1812,8 @@ def from_records( decimal.Decimal) to floating point, useful for SQL result sets. nrows : int, default None Number of rows to read if data is an iterator. 
+ consolidate: bool, default True + Whether to consolidate the arrays in the new DataFrame. Returns ------- @@ -1895,7 +1949,9 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) + mgr = arrays_to_mgr( + arrays, arr_columns, result_index, columns, consolidate=consolidate + ) return cls(mgr) @@ -2074,6 +2130,7 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, + consolidate: bool = True, ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2094,6 +2151,8 @@ def _from_arrays( stored in a block (numpy ndarray or ExtensionArray), have the same length as and are aligned with the index, and that `columns` and `index` are ensured to be an Index object. + consolidate: bool, default True + Whether to consolidate the passed arrays in the new DataFrame. Returns ------- @@ -2109,6 +2168,7 @@ def _from_arrays( columns, dtype=dtype, verify_integrity=verify_integrity, + consolidate=consolidate, ) return cls(mgr) @@ -6047,7 +6107,7 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): raise NotImplementedError(right) return type(self)._from_arrays( - arrays, self.columns, self.index, verify_integrity=False + arrays, self.columns, self.index, verify_integrity=False, consolidate=False ) def _combine_frame(self, other: DataFrame, func, fill_value=None): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index eefd1a604f894..d0ec0a72744ed 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -66,6 +66,7 @@ def arrays_to_mgr( columns, dtype: Optional[DtypeObj] = None, verify_integrity: bool = True, + consolidate: bool = True, ): """ Segregate Series based on type and coerce into matrices. 
@@ -92,7 +93,9 @@ def arrays_to_mgr( # from BlockManager perspective axes = [columns, index] - return create_block_manager_from_arrays(arrays, arr_names, axes) + return create_block_manager_from_arrays( + arrays, arr_names, axes, consolidate=consolidate + ) def masked_rec_array_to_mgr( @@ -131,7 +134,9 @@ def masked_rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) + mgr = arrays_to_mgr( + arrays, arr_columns, index, columns, dtype, consolidate=True + ) # FIXME: dont hardcode if copy: mgr = mgr.copy() @@ -142,7 +147,14 @@ def masked_rec_array_to_mgr( # DataFrame Constructor Interface -def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): +def init_ndarray( + values, + index, + columns, + dtype: Optional[DtypeObj], + copy: bool, + consolidate: bool = True, +): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): @@ -171,7 +183,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) - return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + return arrays_to_mgr( + [values], columns, index, columns, dtype=dtype, consolidate=consolidate + ) elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 @@ -185,7 +199,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr(values, columns, index, columns, dtype=dtype) + return arrays_to_mgr( + values, columns, index, columns, dtype=dtype, consolidate=consolidate + ) # by definition an array here # the dtypes will be coerced to a single dtype @@ -235,10 +251,18 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): else: block_values = [values] - return create_block_manager_from_blocks(block_values, [columns, 
index]) + return create_block_manager_from_blocks( + block_values, [columns, index], consolidate=consolidate + ) -def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): +def init_dict( + data: Dict, + index, + columns, + dtype: Optional[DtypeObj] = None, + consolidate: bool = True, +): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. @@ -284,7 +308,9 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] - return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) + return arrays_to_mgr( + arrays, data_names, index, columns, dtype=dtype, consolidate=consolidate + ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 168dba25ba29c..104547dd7555d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1648,7 +1648,9 @@ def fast_xs(self, loc): # Constructor Helpers -def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: +def create_block_manager_from_blocks( + blocks, axes: List[Index], consolidate: bool = True +) -> BlockManager: try: if len(blocks) == 1 and not isinstance(blocks[0], Block): # if blocks[0] is of length 0, return empty blocks @@ -1665,7 +1667,8 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: ] mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() + if consolidate: + mgr._consolidate_inplace() return mgr except ValueError as e: @@ -1675,7 +1678,10 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: def create_block_manager_from_arrays( - arrays, names: Index, axes: List[Index] + arrays, + names: Index, + axes: List[Index], + consolidate: bool = True, ) -> BlockManager: assert isinstance(names, Index) assert 
isinstance(axes, list) @@ -1687,10 +1693,11 @@ def create_block_manager_from_arrays( try: blocks = _form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr except ValueError as e: raise construction_error(len(arrays), arrays[0].shape, axes, e) + if consolidate: + mgr._consolidate_inplace() + return mgr def construction_error(tot_items, block_shape, axes, e=None): From 396daba7690b4054ef0ad8f4b2ec4f3441fab304 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 3 Jan 2021 22:17:46 -0800 Subject: [PATCH 04/31] BUG: respect copy=False in constructing DataFrame from dict --- pandas/_testing/__init__.py | 2 +- pandas/conftest.py | 4 +- pandas/core/frame.py | 84 +++----------------- pandas/core/groupby/groupby.py | 4 +- pandas/core/internals/construction.py | 26 +++--- pandas/core/internals/managers.py | 65 ++++++++++++--- pandas/tests/arithmetic/test_numeric.py | 4 +- pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/frame/test_constructors.py | 59 +++++++++++++- pandas/tests/frame/test_reductions.py | 2 +- pandas/tests/indexing/test_loc.py | 1 + pandas/tests/series/indexing/test_setitem.py | 16 ++-- 12 files changed, 154 insertions(+), 115 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0591fc6afd633..ab281f3240dbb 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -650,7 +650,7 @@ def getPeriodData(nper=None): # make frame def makeTimeDataFrame(nper=None, freq="B"): data = getTimeSeriesData(nper, freq) - return DataFrame(data) + return DataFrame(data)._consolidate() def makeDataFrame(): diff --git a/pandas/conftest.py b/pandas/conftest.py index 2862f7c957abc..80dea694a447f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -695,7 +695,7 @@ def float_frame(): [30 rows x 4 columns] """ - return DataFrame(tm.getSeriesData()) + return DataFrame(tm.getSeriesData())._consolidate() # 
---------------------------------------------------------------- @@ -1189,7 +1189,7 @@ def any_nullable_int_dtype(request): @pytest.fixture(params=tm.ALL_EA_INT_DTYPES + tm.FLOAT_EA_DTYPES) -def any_numeric_dtype(request): +def any_nullable_numeric_dtype(request): """ Parameterized fixture for any nullable integer dtype and any float ea dtypes. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8517542fdb021..c88d784aa81e1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -427,8 +427,6 @@ class DataFrame(NDFrame, OpsMixin): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input. - consolidate : bool or None, default None - Whether to consolidate the arrays in the new DataFrame. See Also -------- @@ -516,16 +514,12 @@ def __init__( columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, copy: bool = False, - consolidate=None, ): if data is None: data = {} if dtype is not None: dtype = self._validate_dtype(dtype) - if consolidate is None: - consolidate = not copy - if isinstance(data, DataFrame): data = data._mgr @@ -540,7 +534,7 @@ def __init__( ) elif isinstance(data, dict): - mgr = init_dict(data, index, columns, dtype=dtype, consolidate=consolidate) + mgr = init_dict(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords @@ -551,14 +545,7 @@ def __init__( # a masked array else: data = sanitize_masked_array(data) - mgr = init_ndarray( - data, - index, - columns, - dtype=dtype, - copy=copy, - consolidate=consolidate, - ) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: @@ -566,26 +553,11 @@ def __init__( data = {k: data[k] for k in data_columns} if columns is None: columns = data_columns - mgr = init_dict( - data, index, columns, dtype=dtype, consolidate=consolidate - ) + 
mgr = init_dict(data, index, columns, dtype=dtype) elif getattr(data, "name", None) is not None: - mgr = init_dict( - {data.name: data}, - index, - columns, - dtype=dtype, - consolidate=consolidate, - ) + mgr = init_dict({data.name: data}, index, columns, dtype=dtype) else: - mgr = init_ndarray( - data, - index, - columns, - dtype=dtype, - copy=copy, - consolidate=consolidate, - ) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) # For data is list-like, or Iterable (will consume into list) elif is_list_like(data): @@ -598,27 +570,11 @@ def __init__( arrays, columns, index = nested_data_to_arrays( data, columns, index, dtype ) - mgr = arrays_to_mgr( - arrays, - columns, - index, - columns, - dtype=dtype, - consolidate=consolidate, - ) + mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) else: - mgr = init_ndarray( - data, - index, - columns, - dtype=dtype, - copy=copy, - consolidate=consolidate, - ) + mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: - mgr = init_dict( - {}, index, columns, dtype=dtype, consolidate=consolidate - ) + mgr = init_dict({}, index, columns, dtype=dtype) # For data is scalar else: if index is None or columns is None: @@ -635,21 +591,14 @@ def __init__( construct_1d_arraylike_from_scalar(data, len(index), dtype) for _ in range(len(columns)) ] - mgr = arrays_to_mgr( - values, columns, index, columns, dtype=None, consolidate=consolidate - ) + mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) else: values = construct_2d_arraylike_from_scalar( data, len(index), len(columns), dtype, copy ) mgr = init_ndarray( - values, - index, - columns, - dtype=values.dtype, - copy=False, - consolidate=consolidate, + values, index, columns, dtype=values.dtype, copy=False ) NDFrame.__init__(self, mgr) @@ -1761,7 +1710,6 @@ def from_records( columns=None, coerce_float: bool = False, nrows=None, - consolidate: bool = True, ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. 
@@ -1789,8 +1737,6 @@ def from_records( decimal.Decimal) to floating point, useful for SQL result sets. nrows : int, default None Number of rows to read if data is an iterator. - consolidate: bool, default True - Whether to consolidate the arrays in the new DataFrame. Returns ------- @@ -1930,9 +1876,7 @@ def from_records( arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = arrays_to_mgr( - arrays, arr_columns, result_index, columns, consolidate=consolidate - ) + mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns) return cls(mgr) @@ -2111,7 +2055,6 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, - consolidate: bool = True, ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2132,8 +2075,6 @@ def _from_arrays( stored in a block (numpy ndarray or ExtensionArray), have the same length as and are aligned with the index, and that `columns` and `index` are ensured to be an Index object. - consolidate: bool, default True - Whether to consolidate the passed arrays in the new DataFrame. 
Returns ------- @@ -2149,7 +2090,6 @@ def _from_arrays( columns, dtype=dtype, verify_integrity=verify_integrity, - consolidate=consolidate, ) return cls(mgr) @@ -6115,7 +6055,7 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): raise NotImplementedError(right) return type(self)._from_arrays( - arrays, self.columns, self.index, verify_integrity=False, consolidate=False + arrays, self.columns, self.index, verify_integrity=False ) def _combine_frame(self, other: DataFrame, func, fill_value=None): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index aef4c036abc65..93cfa67d709b6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1764,7 +1764,9 @@ def describe(self, **kwargs): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T - return result.unstack() + # FIXME: not being consolidated breaks + # test_describe_with_duplicate_output_column_names + return result._consolidate().unstack() @final def resample(self, rule, *args, **kwargs): diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 30e3bffa0ec77..c5487ab796438 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -135,9 +135,7 @@ def masked_rec_array_to_mgr( if columns is None: columns = arr_columns - mgr = arrays_to_mgr( - arrays, arr_columns, index, columns, dtype, consolidate=True - ) # FIXME: dont hardcode + mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype) if copy: mgr = mgr.copy() @@ -154,7 +152,6 @@ def init_ndarray( columns, dtype: Optional[DtypeObj], copy: bool, - consolidate: bool = True, ): # input must be a ndarray, list, Series, index @@ -184,9 +181,7 @@ def init_ndarray( if columns is None: columns = Index(range(len(values))) - return arrays_to_mgr( - values, columns, index, columns, dtype=dtype, consolidate=consolidate - ) + return arrays_to_mgr(values, columns, index, columns, 
dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -236,17 +231,16 @@ def init_ndarray( else: block_values = [values] - return create_block_manager_from_blocks( - block_values, [columns, index], consolidate=consolidate - ) + return create_block_manager_from_blocks(block_values, [columns, index]) def init_dict( data: Dict, index, columns, + *, dtype: Optional[DtypeObj] = None, - consolidate: bool = True, + copy: bool = True, ): """ Segregate Series based on type and coerce into matrices. @@ -281,6 +275,8 @@ def init_dict( val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() + arrays = list(arrays) + else: keys = list(data.keys()) columns = data_names = Index(keys) @@ -291,8 +287,14 @@ def init_dict( arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] + + if copy: + # arrays_to_mgr (via form_blocks) won't make copies for EAs + arrays = [x if not is_extension_array_dtype(x) else x.copy() for x in arrays] + # TODO: can we get rid of the dt64tz special case above? 
+ return arrays_to_mgr( - arrays, data_names, index, columns, dtype=dtype, consolidate=consolidate + arrays, data_names, index, columns, dtype=dtype, consolidate=copy ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5959a31ca10e2..f465dd55cb468 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -16,7 +16,7 @@ import numpy as np -from pandas._libs import internals as libinternals, lib +from pandas._libs import NaT, internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Shape from pandas.errors import PerformanceWarning from pandas.util._validators import validate_bool_kwarg @@ -956,7 +956,15 @@ def fast_xs(self, loc: int) -> ArrayLike: # Such assignment may incorrectly coerce NaT to None # result[blk.mgr_locs] = blk._slice((slice(None), loc)) for i, rl in enumerate(blk.mgr_locs): - result[rl] = blk.iget((i, loc)) + out = blk.iget((i, loc)) + if is_dtype_equal(blk.dtype, dtype) and dtype == "m8[ns]": + # FIXME: kludge for NaT -> tdnat + # TODO: need a test like test_sum_nanops_timedelta + # where initial DataFrame is not consolidated + if out is NaT: + result[rl] = np.timedelta64("NaT", "ns") + continue + result[rl] = out if isinstance(dtype, ExtensionDtype): result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) @@ -1705,7 +1713,7 @@ def create_block_manager_from_arrays( # Note: just calling extract_array breaks tests that patch PandasArray._typ. 
arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays] try: - blocks = _form_blocks(arrays, names, axes) + blocks = _form_blocks(arrays, names, axes, consolidate) mgr = BlockManager(blocks, axes) except ValueError as e: raise construction_error(len(arrays), arrays[0].shape, axes, e) @@ -1738,7 +1746,7 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def _form_blocks(arrays, names: Index, axes) -> List[Block]: +def _form_blocks(arrays, names: Index, axes, consolidate: bool) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1764,23 +1772,31 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: blocks: List[Block] = [] if len(items_dict["FloatBlock"]): - float_blocks = _multi_blockify(items_dict["FloatBlock"]) + float_blocks = _multi_blockify( + items_dict["FloatBlock"], consolidate=consolidate + ) blocks.extend(float_blocks) if len(items_dict["ComplexBlock"]): - complex_blocks = _multi_blockify(items_dict["ComplexBlock"]) + complex_blocks = _multi_blockify( + items_dict["ComplexBlock"], consolidate=consolidate + ) blocks.extend(complex_blocks) if len(items_dict["TimeDeltaBlock"]): - timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"]) + timedelta_blocks = _multi_blockify( + items_dict["TimeDeltaBlock"], consolidate=consolidate + ) blocks.extend(timedelta_blocks) if len(items_dict["IntBlock"]): - int_blocks = _multi_blockify(items_dict["IntBlock"]) + int_blocks = _multi_blockify(items_dict["IntBlock"], consolidate=consolidate) blocks.extend(int_blocks) if len(items_dict["DatetimeBlock"]): - datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE) + datetime_blocks = _simple_blockify( + items_dict["DatetimeBlock"], DT64NS_DTYPE, consolidate=consolidate + ) blocks.extend(datetime_blocks) if 
len(items_dict["DatetimeTZBlock"]): @@ -1791,11 +1807,15 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: blocks.extend(dttz_blocks) if len(items_dict["BoolBlock"]): - bool_blocks = _simple_blockify(items_dict["BoolBlock"], np.bool_) + bool_blocks = _simple_blockify( + items_dict["BoolBlock"], np.bool_, consolidate=consolidate + ) blocks.extend(bool_blocks) if len(items_dict["ObjectBlock"]) > 0: - object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_) + object_blocks = _simple_blockify( + items_dict["ObjectBlock"], np.object_, consolidate=consolidate + ) blocks.extend(object_blocks) if len(items_dict["CategoricalBlock"]) > 0: @@ -1834,11 +1854,14 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: return blocks -def _simple_blockify(tuples, dtype) -> List[Block]: +def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]: """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """ + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype) + values, placement = _stack_arrays(tuples, dtype) # TODO: CHECK DTYPE? 
@@ -1849,8 +1872,12 @@ def _simple_blockify(tuples, dtype) -> List[Block]: return [block] -def _multi_blockify(tuples, dtype=None): +def _multi_blockify(tuples, dtype=None, consolidate: bool = True): """ return an array of blocks that potentially have different dtypes """ + + if not consolidate: + return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype) + # group by dtype grouper = itertools.groupby(tuples, lambda x: x[2].dtype) @@ -1865,6 +1892,18 @@ def _multi_blockify(tuples, dtype=None): return new_blocks +def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[Block]: + # tuples produced within _form_blocks are of the form (placement, whatever, array) + if dtype is not None: + return [ + make_block( + np.atleast_2d(x[2].astype(dtype, copy=False)), placement=x[0], ndim=2 + ) + for x in tuples + ] + return [make_block(np.atleast_2d(x[2]), placement=x[0], ndim=2) for x in tuples] + + def _stack_arrays(tuples, dtype): # fml diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f4f258b559939..89255bc71f55d 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -536,9 +536,7 @@ def test_df_mod_zero_df(self): # GH#3590, modulo as ints df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) - # this is technically wrong, as the integer portion is coerced to float - # ### - first = Series([0, 0, 0, 0], dtype="float64") + first = Series([0, 0, 0, 0], dtype="int64") second = Series([np.nan, np.nan, np.nan, 0]) expected = pd.DataFrame({"first": first, "second": second}) result = df % df diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e6d1cd5f47d8d..764862f7807fb 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1297,7 +1297,7 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne) f(df, 0) def 
test_comparison_protected_from_errstate(self): - missing_df = tm.makeDataFrame() + missing_df = tm.makeDataFrame()._consolidate() missing_df.iloc[0]["A"] = np.nan with np.errstate(invalid="ignore"): expected = missing_df.values < 0 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f408a3ddde04e..0f2a39e582a49 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1840,7 +1840,7 @@ def test_constructor_ndarray_copy(self, float_frame): def test_constructor_series_copy(self, float_frame): series = float_frame._series - df = DataFrame({"A": series["A"]}) + df = DataFrame({"A": series["A"]}, copy=True) df["A"][:] = 5 assert not (series["A"] == 5).all() @@ -2153,6 +2153,63 @@ def test_constructor_list_str_na(self, string_dtype): expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("copy", [False, True]) + def test_dict_nocopy(self, copy, any_nullable_numeric_dtype, any_numpy_dtype): + a = np.array([1, 2], dtype=any_numpy_dtype) + b = np.array([3, 4], dtype=any_numpy_dtype) + if b.dtype.kind in ["u", "s", "U", "S"]: + return + + c = pd.array([1, 2], dtype=any_nullable_numeric_dtype) + df = DataFrame({"a": a, "b": b, "c": c}, copy=copy) + + def check_views(): + assert sum(x.values is c for x in df._mgr.blocks) == 1 + assert ( + sum(x.values.base is a for x in df._mgr.blocks if not x.is_extension) + == 1 + ) + assert ( + sum(x.values.base is b for x in df._mgr.blocks if not x.is_extension) + == 1 + ) + + if not copy: + # constructor preserves views + check_views() + + df.iloc[0, 0] = 0 + df.iloc[0, 1] = 0 + if not copy: + # setitem on non-EA values preserves views + assert sum(x.values is c for x in df._mgr.blocks) == 1 + # TODO: we can call check_views if we stop consolidating + # in setitem_with_indexer + + # FIXME: until GH#35417, iloc.setitem into EA values does not preserve + # view, so we have 
to check in the other direction + # df.iloc[0, 2] = 0 + # if not copy: + # check_views() + c[0] = 0 + + if copy: + if a.dtype.kind == "M": + assert a[0] == a.dtype.type(1, "ns") + assert b[0] == b.dtype.type(3, "ns") + else: + assert a[0] == a.dtype.type(1) + assert b[0] == b.dtype.type(3) + # FIXME: enable after GH#35417 + # assert c[0] == 1 + assert df.iloc[0, 2] == 1 + else: + # TODO: we can call check_views if we stop consolidating + # in setitem_with_indexer + # FIXME: enable after GH#35417 + # assert b[0] == 0 + assert df.iloc[0, 2] == 0 + class TestDataFrameConstructorWithDatetimeTZ: @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"]) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1c397d6a6a1b5..55e4fa8cee199 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -765,7 +765,7 @@ def test_operators_timedelta64(self): def test_std_timedelta64_skipna_false(self): # GH#37392 tdi = pd.timedelta_range("1 Day", periods=10) - df = DataFrame({"A": tdi, "B": tdi}) + df = DataFrame({"A": tdi, "B": tdi}, copy=True) df.iloc[-2, -1] = pd.NaT result = df.std(skipna=False) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6c5cd0f335faa..3aaaf667227de 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1000,6 +1000,7 @@ def test_loc_setitem_empty_append_raises(self): df.loc[[0, 1], "x"] = data msg = "cannot copy sequence with size 2 to array axis with dimension 0" + msg = "Must have equal len keys and value when setting with an iterable" with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index f79a822481ea0..8e4a6d50f5070 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -167,19 +167,19 @@ def 
test_setitem_boolean_td64_values_cast_na(self, value): expected = Series([NaT, 1, 2], dtype="timedelta64[ns]") tm.assert_series_equal(series, expected) - def test_setitem_boolean_nullable_int_types(self, any_numeric_dtype): + def test_setitem_boolean_nullable_int_types(self, any_nullable_numeric_dtype): # GH: 26468 - ser = Series([5, 6, 7, 8], dtype=any_numeric_dtype) - ser[ser > 6] = Series(range(4), dtype=any_numeric_dtype) - expected = Series([5, 6, 2, 3], dtype=any_numeric_dtype) + ser = Series([5, 6, 7, 8], dtype=any_nullable_numeric_dtype) + ser[ser > 6] = Series(range(4), dtype=any_nullable_numeric_dtype) + expected = Series([5, 6, 2, 3], dtype=any_nullable_numeric_dtype) tm.assert_series_equal(ser, expected) - ser = Series([5, 6, 7, 8], dtype=any_numeric_dtype) - ser.loc[ser > 6] = Series(range(4), dtype=any_numeric_dtype) + ser = Series([5, 6, 7, 8], dtype=any_nullable_numeric_dtype) + ser.loc[ser > 6] = Series(range(4), dtype=any_nullable_numeric_dtype) tm.assert_series_equal(ser, expected) - ser = Series([5, 6, 7, 8], dtype=any_numeric_dtype) - loc_ser = Series(range(4), dtype=any_numeric_dtype) + ser = Series([5, 6, 7, 8], dtype=any_nullable_numeric_dtype) + loc_ser = Series(range(4), dtype=any_nullable_numeric_dtype) ser.loc[ser > 6] = loc_ser.loc[loc_ser > 1] tm.assert_series_equal(ser, expected) From 11ae1c96f0de361c63ae43526e79a3d859a14d9a Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 3 Jan 2021 22:22:12 -0800 Subject: [PATCH 05/31] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index af11b6543a74b..20f204a2a5c38 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -52,6 +52,7 @@ Other enhancements - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) 
- :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) +- The :class:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made, rather than ignoring it (:issue:`32960`) .. --------------------------------------------------------------------------- From b505267f0e4e5de1ffe354dc06d47ec79b8188dc Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 3 Jan 2021 22:25:52 -0800 Subject: [PATCH 06/31] clean test --- pandas/tests/frame/test_constructors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0f2a39e582a49..d4ff19513ad44 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2157,7 +2157,8 @@ def test_constructor_list_str_na(self, string_dtype): def test_dict_nocopy(self, copy, any_nullable_numeric_dtype, any_numpy_dtype): a = np.array([1, 2], dtype=any_numpy_dtype) b = np.array([3, 4], dtype=any_numpy_dtype) - if b.dtype.kind in ["u", "s", "U", "S"]: + if b.dtype.kind in ["S", "U"]: + # These get cast, making the checks below more cumbersome return c = pd.array([1, 2], dtype=any_nullable_numeric_dtype) From b70d997a0a5d2e34790a3d032a2b20a8b493e681 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Jan 2021 09:52:25 -0800 Subject: [PATCH 07/31] fixed xfail --- pandas/tests/indexing/test_loc.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3aaaf667227de..2111a25cc0653 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -7,7 +7,6 @@ import numpy as np import pytest -from pandas.compat.numpy import is_numpy_dev import pandas.util._test_decorators as td import pandas as pd @@ -985,7 +984,6 @@ def test_loc_setitem_empty_append_single_value(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) - 
@pytest.mark.xfail(is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe From 09213e09b72ac7d250f456fb5479fbe15aa64d82 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Jan 2021 10:07:53 -0800 Subject: [PATCH 08/31] update whatsnew --- doc/source/whatsnew/v1.3.0.rst | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 242ddec4b01ea..3ba557877154a 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -39,6 +39,21 @@ For example: ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See ref:`window.overview` for performance and functional benefits. (:issue:`15095`) +.. _whatsnew_130.dataframe_honors_copy_with_dict: + +DataFrame constructor honors ``copy=False`` With Dict +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When passing a dictionary to :class:`DataFrame` with (the default) ``copy=False``, +a copy will no longer be made (:issue:`32960`) + +.. ipython:: python + + arr = np.array([1, 2, 3]) + df = pd.DataFrame({"A": arr, "B": arr.copy()}) + arr[0] = 0 + assert df.iloc[0, 0] == 0 + .. _whatsnew_130.enhancements.other: Other enhancements @@ -52,7 +67,6 @@ Other enhancements - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - :func:`pandas.read_excel` can now auto detect .xlsb files (:issue:`35416`) -- The :class:`DataFrame` constructor now uses ``copy`` for dict-inputs to control whether copies of the arrays are made, rather than ignoring it (:issue:`32960`) .. 
--------------------------------------------------------------------------- From 31bda587d8ab6465ca5d439bc4c2ed0f5037776c Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Jan 2021 13:51:50 -0800 Subject: [PATCH 09/31] de-kludge --- pandas/core/internals/managers.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f465dd55cb468..5bf065093c7b1 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -16,7 +16,7 @@ import numpy as np -from pandas._libs import NaT, internals as libinternals, lib +from pandas._libs import internals as libinternals, lib from pandas._typing import ArrayLike, DtypeObj, Label, Shape from pandas.errors import PerformanceWarning from pandas.util._validators import validate_bool_kwarg @@ -40,7 +40,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject -from pandas.core.construction import extract_array +from pandas.core.construction import ensure_wrapped_if_datetimelike, extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index from pandas.core.internals.blocks import ( @@ -952,19 +952,13 @@ def fast_xs(self, loc: int) -> ArrayLike: else: result = np.empty(n, dtype=dtype) + result = ensure_wrapped_if_datetimelike(result) + for blk in self.blocks: # Such assignment may incorrectly coerce NaT to None # result[blk.mgr_locs] = blk._slice((slice(None), loc)) for i, rl in enumerate(blk.mgr_locs): - out = blk.iget((i, loc)) - if is_dtype_equal(blk.dtype, dtype) and dtype == "m8[ns]": - # FIXME: kludge for NaT -> tdnat - # TODO: need a test like test_sum_nanops_timedelta - # where initial DataFrame is not consolidated - if out is NaT: - result[rl] = np.timedelta64("NaT", "ns") - continue - result[rl] = out + result[rl] = blk.iget((i, loc)) if isinstance(dtype, 
ExtensionDtype): result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) From 37a2c0c510126ce887a1740fcc322b027e9966ac Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 4 Jan 2021 13:55:13 -0800 Subject: [PATCH 10/31] remove no-longer-used msg --- pandas/tests/indexing/test_loc.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2111a25cc0653..a5748ae60534b 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -997,7 +997,6 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "cannot copy sequence with size 2 to array axis with dimension 0" msg = "Must have equal len keys and value when setting with an iterable" with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data From 185cd998610a38e9422e2c07ddddb1499ab7ca28 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Jan 2021 21:37:44 -0800 Subject: [PATCH 11/31] fix broken test --- pandas/tests/indexing/test_partial.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 0251fb4a0ebd6..7943cba629208 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -157,7 +157,8 @@ def test_partial_setting_mixed_dtype(self): tm.assert_frame_equal(df, DataFrame(columns=["A", "B"], index=[0])) # columns will align - df = DataFrame(columns=["A", "B"]) + # TODO: it isnt great that this behavior depends on consolidation + df = DataFrame(columns=["A", "B"])._consolidate() df.loc[0] = Series(1, index=["B"]) exp = DataFrame([[np.nan, 1]], columns=["A", "B"], index=[0], dtype="float64") From 46f2fcf36ce04edf535b626a9d87f42c1ffcef32 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 21 Jan 2021 13:49:27 -0800 Subject: [PATCH 12/31] always copy when data is None --- pandas/core/frame.py | 1 + 1 
file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a09b6a79fa3c..7906164ec4646 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -519,6 +519,7 @@ def __init__( ): if data is None: data = {} + copy = True if dtype is not None: dtype = self._validate_dtype(dtype) From 590c82092e4323579265b450bc1911c7521a40c6 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 23 Jan 2021 13:15:48 -0800 Subject: [PATCH 13/31] update exception message --- pandas/tests/indexing/test_loc.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 17c5978e36fcf..7f44f22e97e3a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -989,7 +989,12 @@ def test_loc_setitem_empty_append_raises(self): with pytest.raises(KeyError, match=msg): df.loc[[0, 1], "x"] = data - msg = "Must have equal len keys and value when setting with an iterable" + msg = "|".join( + [ + "cannot copy sequence with size 2 to array axis with dimension 0", + "Must have equal len keys and value when setting with an iterable", + ] + ) with pytest.raises(ValueError, match=msg): df.loc[0:2, "x"] = data From fb8f32dc118b7306b063c34718a4a3ca3fe8a920 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 27 Jan 2021 10:05:17 -0800 Subject: [PATCH 14/31] update exception message --- pandas/tests/indexing/test_loc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7f44f22e97e3a..2cfbcecdf4b88 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -992,6 +992,7 @@ def test_loc_setitem_empty_append_raises(self): msg = "|".join( [ "cannot copy sequence with size 2 to array axis with dimension 0", + r"could not broadcast input array from shape \(2,\) into shape \(0,\)" "Must have equal len keys and value when setting with an iterable", ] ) From 
78351844672d98bd39da20a5466b1bd799e4406a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 27 Jan 2021 16:14:41 -0800 Subject: [PATCH 15/31] typo fixup --- pandas/tests/indexing/test_loc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2cfbcecdf4b88..9b308f7667ccd 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -992,7 +992,7 @@ def test_loc_setitem_empty_append_raises(self): msg = "|".join( [ "cannot copy sequence with size 2 to array axis with dimension 0", - r"could not broadcast input array from shape \(2,\) into shape \(0,\)" + r"could not broadcast input array from shape \(2,\) into shape \(0,\)", "Must have equal len keys and value when setting with an iterable", ] ) From b0a6abd447ec961e649cc35d5a48a9460a78223e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Feb 2021 18:12:05 -0800 Subject: [PATCH 16/31] CI: fix broken asv --- asv_bench/benchmarks/gil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 5d9070de92ec7..30919d43ad394 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,7 +1,7 @@ import numpy as np from pandas import DataFrame, Series, date_range, factorize, read_csv -from pandas.core.algorithms import take_1d +from pandas.core.algorithms import take from .pandas_vb_common import tm @@ -110,7 +110,7 @@ def setup(self, dtype): @test_parallel(num_threads=2) def parallel_take1d(): - take_1d(df["col"].values, indexer) + take(df["col"].values, indexer) self.parallel_take1d = parallel_take1d From bf942ae7dd5de0cf02ee3edf3ae4409da4c91ec8 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 11 Feb 2021 07:28:38 -0800 Subject: [PATCH 17/31] revert --- asv_bench/benchmarks/gil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 
30919d43ad394..47523005a877f 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,7 +1,7 @@ import numpy as np from pandas import DataFrame, Series, date_range, factorize, read_csv -from pandas.core.algorithms import take +from pandas.core.algorithms import take_nd from .pandas_vb_common import tm @@ -110,7 +110,7 @@ def setup(self, dtype): @test_parallel(num_threads=2) def parallel_take1d(): - take(df["col"].values, indexer) + take_nd(df["col"].values, indexer) self.parallel_take1d = parallel_take1d From 48e359e38f6ec1be6fdb13efc4f1ea6580d3ed3b Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 13 Feb 2021 15:07:56 -0800 Subject: [PATCH 18/31] Default to copy=True for dict data --- pandas/core/frame.py | 8 +++++++- pandas/tests/arithmetic/test_numeric.py | 4 +++- pandas/tests/extension/decimal/test_decimal.py | 13 ++++++++++++- pandas/tests/extension/test_sparse.py | 2 +- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae8ace40f0a3e..fe000ec454fc8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -508,8 +508,11 @@ def __init__( index: Optional[Axes] = None, columns: Optional[Axes] = None, dtype: Optional[Dtype] = None, - copy: bool = False, + copy: Optional[bool] = None, ): + orig_copy = copy # GH#38939 + copy = copy if copy is not None else False + if data is None: data = {} copy = True @@ -520,6 +523,7 @@ def __init__( data = data._mgr if isinstance(data, (BlockManager, ArrayManager)): + copy = copy if copy is not None else False if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -530,6 +534,8 @@ def __init__( ) elif isinstance(data, dict): + # GH#38939 de facto copy defaults to False only in non-dict cases + copy = orig_copy if orig_copy is not None else True mgr = init_dict(data, index, columns, dtype=dtype, copy=copy) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as 
mrecords diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 89255bc71f55d..6d8940a435ba5 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -534,7 +534,9 @@ def test_df_div_zero_series_does_not_commute(self): def test_df_mod_zero_df(self): # GH#3590, modulo as ints - df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + # GH#38939 If we dont pass copy=False, df is consolidated and + # result["first"] is float64 instead of int64 + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False) first = Series([0, 0, 0, 0], dtype="int64") second = Series([np.nan, np.nan, np.nan, 0]) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 23b1ce250a5e5..3556b8c8d6848 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -274,7 +274,18 @@ def test_dataframe_constructor_with_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("frame", [True, False]) +@pytest.mark.parametrize( + "frame", + [ + pytest.param( + True, + marks=pytest.mark.xfail( + reason="pd.concat call inside NDFrame.astype reverts the dtype" + ), + ), + False, + ], +) def test_astype_dispatches(frame): # This is a dtype-specific test that ensures Series[decimal].astype # gets all the way through to ExtensionArray.astype diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 766910618d925..9e133211e61c3 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -274,7 +274,7 @@ def test_combine_le(self, data_repeated): def test_fillna_copy_frame(self, data_missing): arr = data_missing.take([1, 1]) - df = pd.DataFrame({"A": arr}) + df = pd.DataFrame({"A": arr}, copy=False) filled_val = df.iloc[0, 0] result = df.fillna(filled_val) From 
048e82629ee72c758037f9697f337215e4c78e5d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 15 Feb 2021 20:13:28 -0800 Subject: [PATCH 19/31] troubleshoot docbuild --- pandas/core/internals/construction.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 3e7c08cd4a62b..5fb3755ea07a3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -41,6 +41,7 @@ is_named_tuple, is_object_dtype, ) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeIndex, @@ -327,7 +328,13 @@ def init_dict( if copy: # arrays_to_mgr (via form_blocks) won't make copies for EAs - arrays = [x if not is_extension_array_dtype(x) else x.copy() for x in arrays] + # dtype attr check to exclude EADtype-castable strs + arrays = [ + x + if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype) + else x.copy() + for x in arrays + ] # TODO: can we get rid of the dt64tz special case above? return arrays_to_mgr( From 5b3d4197e2b64d225ecf8615ba4b23935b62a588 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Feb 2021 17:56:14 -0800 Subject: [PATCH 20/31] update whatsnew --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index ac13fa908e88f..ca1249a173d49 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -49,13 +49,13 @@ For example: DataFrame constructor honors ``copy=False`` With Dict ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -When passing a dictionary to :class:`DataFrame` with (the default) ``copy=False``, +When passing a dictionary to :class:`DataFrame` with ``copy=False``, a copy will no longer be made (:issue:`32960`) .. 
ipython:: python arr = np.array([1, 2, 3]) - df = pd.DataFrame({"A": arr, "B": arr.copy()}) + df = pd.DataFrame({"A": arr, "B": arr.copy()}, copy=False) df ``df["A"]`` remains a view on ``arr``: From 8b66b1193fc8d959b110ec75c9305b10b595c8ef Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 14:38:50 -0800 Subject: [PATCH 21/31] skip for ArrayManager --- pandas/tests/frame/test_constructors.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3ad815869e8e8..2c082e56d2113 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2297,6 +2297,7 @@ def test_constructor_list_str_na(self, string_dtype): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("copy", [False, True]) + @td.skip_array_manager_not_yet_implemented def test_dict_nocopy(self, copy, any_nullable_numeric_dtype, any_numpy_dtype): a = np.array([1, 2], dtype=any_numpy_dtype) b = np.array([3, 4], dtype=any_numpy_dtype) @@ -2308,13 +2309,18 @@ def test_dict_nocopy(self, copy, any_nullable_numeric_dtype, any_numpy_dtype): df = DataFrame({"a": a, "b": b, "c": c}, copy=copy) def check_views(): - assert sum(x.values is c for x in df._mgr.blocks) == 1 + # written to work for either BlockManager or ArrayManager + assert sum(x is c for x in df._mgr.arrays) == 1 assert ( - sum(x.values.base is a for x in df._mgr.blocks if not x.is_extension) + sum( + x.base is a for x in df._mgr.arrays if isinstance(x.dtype, np.dtype) + ) == 1 ) assert ( - sum(x.values.base is b for x in df._mgr.blocks if not x.is_extension) + sum( + x.base is b for x in df._mgr.arrays if isinstance(x.dtype, np.dtype) + ) == 1 ) @@ -2326,7 +2332,7 @@ def check_views(): df.iloc[0, 1] = 0 if not copy: # setitem on non-EA values preserves views - assert sum(x.values is c for x in df._mgr.blocks) == 1 + assert sum(x is c for x in df._mgr.arrays) == 1 # TODO: we can 
call check_views if we stop consolidating # in setitem_with_indexer From 54cacfcf9dc3b069ef9959db7d65232fbb9c0e68 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 6 Mar 2021 08:09:30 -0800 Subject: [PATCH 22/31] Update doc/source/whatsnew/v1.3.0.rst Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index fdf9b38c68e80..0a72c12072ec0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -112,7 +112,7 @@ For more, see :ref:`io.xml` in the user guide on IO tools. .. _whatsnew_130.dataframe_honors_copy_with_dict: -DataFrame constructor honors ``copy=False`` With Dict +DataFrame constructor honors ``copy=False`` with dict ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When passing a dictionary to :class:`DataFrame` with ``copy=False``, From e11ea68a9b3f4e5b8b0e6f244ce80c93f4cc473e Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 6 Mar 2021 09:41:19 -0800 Subject: [PATCH 23/31] requested edits --- doc/source/whatsnew/v1.3.0.rst | 3 +++ pandas/_testing/__init__.py | 4 ++-- pandas/core/internals/managers.py | 4 ++-- pandas/tests/extension/decimal/array.py | 2 +- pandas/tests/frame/test_reductions.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index a38cfd9eebfde..baef88db23c43 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -131,6 +131,9 @@ a copy will no longer be made (:issue:`32960`) arr[0] = 0 assert df.iloc[0, 0] == 0 +The default behavior when not passing ``copy`` will remain unchanged, i.e. +a copy will be made. + .. 
_whatsnew_130.enhancements.other: Other enhancements diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 9753274962185..b38ca516c4393 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -485,12 +485,12 @@ def getPeriodData(nper=None): # make frame def makeTimeDataFrame(nper=None, freq="B"): data = getTimeSeriesData(nper, freq) - return DataFrame(data)._consolidate() + return DataFrame(data) def makeDataFrame() -> DataFrame: data = getSeriesData() - return DataFrame(data)._consolidate() + return DataFrame(data) def getMixedTypeDict(): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 301f287167d0a..774ccce3c142c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1911,12 +1911,12 @@ def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[ # tuples produced within _form_blocks are of the form (placement, whatever, array) if dtype is not None: return [ - make_block( + new_block( np.atleast_2d(x[1].astype(dtype, copy=False)), placement=x[0], ndim=2 ) for x in tuples ] - return [make_block(np.atleast_2d(x[1]), placement=x[0], ndim=2) for x in tuples] + return [new_block(np.atleast_2d(x[1]), placement=x[0], ndim=2) for x in tuples] def _stack_arrays(tuples, dtype: np.dtype): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9e1c517704743..643dd11c1ccd7 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -147,7 +147,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): return self._from_sequence(result) def copy(self): - return type(self)(self._data.copy()) + return type(self)(self._data.copy(), dtype=self.dtype) def astype(self, dtype, copy=True): if is_dtype_equal(dtype, self._dtype): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 349c88ad2a112..d24320ad17709 
100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -771,7 +771,7 @@ def test_operators_timedelta64(self): def test_std_timedelta64_skipna_false(self): # GH#37392 tdi = pd.timedelta_range("1 Day", periods=10) - df = DataFrame({"A": tdi, "B": tdi}, copy=True) + df = DataFrame({"A": tdi, "B": tdi}) df.iloc[-2, -1] = pd.NaT result = df.std(skipna=False) From 41c4e7a08ebd47539ca129d57641c0f3142e238e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 14:06:41 -0800 Subject: [PATCH 24/31] test test_df_mod_zero_df with and without copy --- pandas/tests/arithmetic/test_numeric.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 17d2a8c9cfe30..ef86a8e6a1cb0 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -535,12 +535,24 @@ def test_df_div_zero_series_does_not_commute(self): # ------------------------------------------------------------------ # Mod By Zero - def test_df_mod_zero_df(self): + def test_df_mod_zero_df(self, using_array_manager): # GH#3590, modulo as ints + df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}) + # this is technically wrong, as the integer portion is coerced to float + first = Series([0, 0, 0, 0]) + if not using_array_manager: + # INFO(ArrayManager) BlockManager doesn't preserve dtype per column + # while ArrayManager performs the op column-wise and thus preserves + # dtype if possible + first = first.astype("float64") + second = Series([np.nan, np.nan, np.nan, 0]) + expected = pd.DataFrame({"first": first, "second": second}) + result = df % df + tm.assert_frame_equal(result, expected) + # GH#38939 If we don't pass copy=False, df is consolidated and # result["first"] is float64 instead of int64 df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False) - first = Series([0, 0, 0, 0],
dtype="int64") second = Series([np.nan, np.nan, np.nan, 0]) expected = pd.DataFrame({"first": first, "second": second}) From 7260a72f9d0c551a93dbcdc711e90db62c1cd023 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 17:09:32 -0800 Subject: [PATCH 25/31] collect copy-adjusting code in one place --- pandas/conftest.py | 2 +- pandas/core/frame.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index fbfe4bff83686..688ad6dcc5e48 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -749,7 +749,7 @@ def float_frame(): [30 rows x 4 columns] """ - return DataFrame(tm.getSeriesData())._consolidate() + return DataFrame(tm.getSeriesData()) @pytest.fixture diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 648da2d309d30..bfd7b6c491c15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -555,12 +555,17 @@ def __init__( dtype: Optional[Dtype] = None, copy: Optional[bool] = None, ): - orig_copy = copy # GH#38939 - copy = copy if copy is not None else False + + if copy is None: + # GH#38939 + if isinstance(data, dict) or data is None: + # retain pre-GH38939 default behavior + copy = True + else: + copy = False if data is None: data = {} - copy = True if dtype is not None: dtype = self._validate_dtype(dtype) @@ -570,8 +575,7 @@ def __init__( if isinstance(data, (BlockManager, ArrayManager)): # first check if a Manager is passed without any other arguments # -> use fastpath (without checking Manager type) - copy = copy if copy is not None else False - if index is None and columns is None and dtype is None and copy is False: + if index is None and columns is None and dtype is None and not copy: # GH#33357 fastpath NDFrame.__init__(self, data) return @@ -585,7 +589,6 @@ def __init__( elif isinstance(data, dict): # GH#38939 de facto copy defaults to False only in non-dict cases - copy = orig_copy if orig_copy is not None else True mgr = dict_to_mgr(data, index, columns, 
dtype=dtype, copy=copy, typ=manager) elif isinstance(data, ma.MaskedArray): import numpy.ma.mrecords as mrecords From 3ddc3d380c1f401cf5740e583f01e6a4348f1567 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Mar 2021 16:20:39 -0700 Subject: [PATCH 26/31] update docstring --- pandas/core/frame.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3c01183489a90..2ba9f05994df2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -475,8 +475,10 @@ class DataFrame(NDFrame, OpsMixin): RangeIndex (0, 1, 2, ..., n) if no column labels are provided. dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. - copy : bool, default False - Copy data from inputs. Only affects DataFrame / 2d ndarray input. + copy : bool or None, default None + Copy data from inputs. + For dict data, passing None behaves like copy=True. For all other + data types, passing None behaves like copy=False. 
See Also -------- From e6bae0f528d2f243f6b9263fc37094e38c4d1af9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Mar 2021 16:36:17 -0700 Subject: [PATCH 27/31] whatsnew, comment --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/tests/frame/test_constructors.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index fd2acc6038d99..b97f8e2be1027 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -479,7 +479,7 @@ Conversion - Bug in :meth:`Series.view` and :meth:`Index.view` when converting between datetime-like (``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64``, ``period``) dtypes (:issue:`39788`) - Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`) - Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`) -- +- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`) Strings ^^^^^^^ diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ec723ca1d0f6c..beb10f8ca7bf2 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2331,7 +2331,10 @@ def check_views(): df.iloc[0, 0] = 0 df.iloc[0, 1] = 0 if not copy: - # setitem on non-EA values preserves views + # Check that the underlying data behind df["c"] is still `c` + # after setting with iloc. Since we don't know which entry in + # df._mgr.arrays corresponds to df["c"], we just check that exactly + # one of these arrays is `c`. 
GH#38939 assert sum(x is c for x in df._mgr.arrays) == 1 # TODO: we can call check_views if we stop consolidating # in setitem_with_indexer From 5c4495344dbf60f63b359b43780fffb4cfc63c53 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Mar 2021 19:46:11 -0700 Subject: [PATCH 28/31] mypy fixup --- pandas/core/internals/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5ea900e391b17..0b18877f47c35 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1942,7 +1942,7 @@ def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]: return [block] -def _multi_blockify(tuples, dtype: Optional[Dtype] = None, consolidate: bool = True): +def _multi_blockify(tuples, dtype: Optional[DtypeObj] = None, consolidate: bool = True): """ return an array of blocks that potentially have different dtypes """ if not consolidate: From 1b7f7ca3cc2668fd3cd7a1964e1d66f1025b5c77 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 16 Mar 2021 14:51:02 -0700 Subject: [PATCH 29/31] Update pandas/core/frame.py Co-authored-by: Joris Van den Bossche --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2d21cab4a3d7f..78dd54140226c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -478,8 +478,8 @@ class DataFrame(NDFrame, OpsMixin): Data type to force. Only a single dtype is allowed. If None, infer. copy : bool or None, default None Copy data from inputs. - For dict data, passing None behaves like copy=True. For all other - data types, passing None behaves like copy=False. + For dict data, the default of None behaves like ``copy=True``. For DataFrame + or 2d ndarray input, the default of None behaves like ``copy=False``. 
See Also -------- From ad5485a98b6fb24a267868948a38f631e053b963 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Mar 2021 08:03:29 -0700 Subject: [PATCH 30/31] add versionchanged --- pandas/core/frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5de1685fea2b0..6f2edaa300c93 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -481,6 +481,8 @@ class DataFrame(NDFrame, OpsMixin): For dict data, the default of None behaves like ``copy=True``. For DataFrame or 2d ndarray input, the default of None behaves like ``copy=False``. + .. versionchanged:: 1.3.0 + See Also -------- DataFrame.from_records : Constructor from tuples, also record arrays. @@ -561,9 +563,8 @@ def __init__( ): if copy is None: - # GH#38939 if isinstance(data, dict) or data is None: - # retain pre-GH38939 default behavior + # retain pre-GH#38939 default behavior copy = True else: copy = False From 98b6dff1f6a51f1a621b4d99e75dd4aaf32e6d1b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Mar 2021 15:56:40 -0700 Subject: [PATCH 31/31] update bc .values has changed to DTA/TDA --- pandas/tests/frame/test_constructors.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index af6eaf970439a..d618c4cda4f13 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2323,18 +2323,31 @@ def test_dict_nocopy(self, copy, any_nullable_numeric_dtype, any_numpy_dtype): c = pd.array([1, 2], dtype=any_nullable_numeric_dtype) df = DataFrame({"a": a, "b": b, "c": c}, copy=copy) + def get_base(obj): + if isinstance(obj, np.ndarray): + return obj.base + elif isinstance(obj.dtype, np.dtype): + # i.e. 
DatetimeArray, TimedeltaArray + return obj._ndarray.base + else: + raise TypeError + def check_views(): # written to work for either BlockManager or ArrayManager assert sum(x is c for x in df._mgr.arrays) == 1 assert ( sum( - x.base is a for x in df._mgr.arrays if isinstance(x.dtype, np.dtype) + get_base(x) is a + for x in df._mgr.arrays + if isinstance(x.dtype, np.dtype) ) == 1 ) assert ( sum( - x.base is b for x in df._mgr.arrays if isinstance(x.dtype, np.dtype) + get_base(x) is b + for x in df._mgr.arrays + if isinstance(x.dtype, np.dtype) ) == 1 )