API: honor copy=True when passing dict to DataFrame #38939

Merged: 69 commits, Mar 31, 2021
cbc97f0 ENH: allow non-consolidation in constructors (jbrockmendel, Oct 5, 2020)
a135d96 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Oct 5, 2020)
5c94129 mypy fixup (jbrockmendel, Oct 5, 2020)
d653c54 ENH: allow non-consolidation in constructors (jbrockmendel, Oct 5, 2020)
e71e319 Merge branch 'myway-init-no-consolidate' of github.com:jbrockmendel/p… (jbrockmendel, Jan 3, 2021)
cafa718 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 3, 2021)
c706ad6 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 3, 2021)
3f9195e Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 4, 2021)
1cba671 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 4, 2021)
396daba BUG: respect copy=False in constructing DataFrame from dict (jbrockmendel, Jan 4, 2021)
11ae1c9 whatsnew (jbrockmendel, Jan 4, 2021)
b505267 clean test (jbrockmendel, Jan 4, 2021)
a1e9b68 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 4, 2021)
b70d997 fixed xfail (jbrockmendel, Jan 4, 2021)
09213e0 update whatsnew (jbrockmendel, Jan 4, 2021)
e895de0 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 4, 2021)
31bda58 de-kludge (jbrockmendel, Jan 4, 2021)
37a2c0c remove no-longer-used msg (jbrockmendel, Jan 4, 2021)
b93d7d5 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 10, 2021)
aa667a6 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 12, 2021)
185cd99 fix broken test (jbrockmendel, Jan 12, 2021)
701356f Consolidate in tm, update whatsnew (jbrockmendel, Jan 20, 2021)
948ac67 Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 21, 2021)
46f2fcf always copy when data is None (jbrockmendel, Jan 21, 2021)
0bbfec0 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Jan 23, 2021)
590c820 update exception message (jbrockmendel, Jan 23, 2021)
17a693c Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Jan 27, 2021)
fb8f32d update exception message (jbrockmendel, Jan 27, 2021)
7835184 typo fixup (jbrockmendel, Jan 28, 2021)
2136289 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 2, 2021)
187499c Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 2, 2021)
6f30beb Merge branch 'master' of https://github.com/pandas-dev/pandas into my… (jbrockmendel, Feb 4, 2021)
e80d57c Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 11, 2021)
b0a6abd CI: fix broken asv (jbrockmendel, Feb 11, 2021)
bf942ae revert (jbrockmendel, Feb 11, 2021)
95e30a5 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 11, 2021)
510f697 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 13, 2021)
48e359e Default to copy=True for dict data (jbrockmendel, Feb 13, 2021)
048e826 troubleshoot docbuild (jbrockmendel, Feb 16, 2021)
6a9c9f0 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 17, 2021)
a17c728 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 18, 2021)
5b3d419 update whatsnew (jbrockmendel, Feb 18, 2021)
f961378 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 18, 2021)
fcee44b Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 27, 2021)
0c60ae8 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 27, 2021)
1468e59 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Feb 27, 2021)
b6d8b70 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 3, 2021)
8b66b11 skip for ArrayManager (jbrockmendel, Mar 3, 2021)
54cacfc Update doc/source/whatsnew/v1.3.0.rst (jbrockmendel, Mar 6, 2021)
5ea7a75 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 6, 2021)
e11ea68 requested edits (jbrockmendel, Mar 6, 2021)
65d01c7 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 8, 2021)
41c4e7a test test_df_mod_zero_df with and without copy (jbrockmendel, Mar 8, 2021)
7260a72 collect copy-adjusting code in one place (jbrockmendel, Mar 9, 2021)
52344bb Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 14, 2021)
3ddc3d3 update docstring (jbrockmendel, Mar 14, 2021)
e6bae0f whatsnew, comment (jbrockmendel, Mar 14, 2021)
7cab084 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 15, 2021)
e8e3d84 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 15, 2021)
5c44953 mypy fixup (jbrockmendel, Mar 15, 2021)
e32f630 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 16, 2021)
abd890a Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 16, 2021)
1b7f7ca Update pandas/core/frame.py (jbrockmendel, Mar 16, 2021)
b326b5f Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 22, 2021)
b7aed5d Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 26, 2021)
4d20fe7 Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 30, 2021)
ad5485a add versionchanged (jbrockmendel, Mar 30, 2021)
6bed6ac Merge branch 'master' into myway-init-no-consolidate (jbrockmendel, Mar 30, 2021)
98b6dff update bc .values has changed to DTA/TDA (jbrockmendel, Mar 30, 2021)
26 changes: 26 additions & 0 deletions doc/source/whatsnew/v1.3.0.rst
@@ -110,6 +110,30 @@ both XPath 1.0 and XSLT 1.0 is available. (:issue:`27554`)

For more, see :ref:`io.xml` in the user guide on IO tools.

.. _whatsnew_130.dataframe_honors_copy_with_dict:

DataFrame constructor honors ``copy=False`` with dict
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When passing a dictionary to :class:`DataFrame` with ``copy=False``,
a copy will no longer be made (:issue:`32960`)

.. ipython:: python

arr = np.array([1, 2, 3])
df = pd.DataFrame({"A": arr, "B": arr.copy()}, copy=False)
df

``df["A"]`` remains a view on ``arr``:

.. ipython:: python

arr[0] = 0
assert df.iloc[0, 0] == 0

The default behavior when not passing ``copy`` will remain unchanged, i.e.
a copy will be made.

.. _whatsnew_130.enhancements.other:

Other enhancements
@@ -513,6 +537,8 @@ Conversion
- Bug in creating a :class:`DataFrame` from an empty ``np.recarray`` not retaining the original dtypes (:issue:`40121`)
- Bug in :class:`DataFrame` failing to raise ``TypeError`` when constructing from a ``frozenset`` (:issue:`40163`)
- Bug in :class:`Index` construction silently ignoring a passed ``dtype`` when the data cannot be cast to that dtype (:issue:`21311`)
- Bug in :class:`DataFrame` construction with a dictionary containing an arraylike with ``ExtensionDtype`` and ``copy=True`` failing to make a copy (:issue:`38939`)
-

Strings
^^^^^^^
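As a quick check of the behavior described in the whatsnew entry above, the following sketch (assuming pandas 1.3 or later) contrasts ``copy=False`` with the default:

```python
import numpy as np
import pandas as pd

arr = np.array([1, 2, 3])

# copy=False: the frame may hold views on the dict's arrays
# (pandas 1.3+ behavior; earlier versions always copied dict input).
df = pd.DataFrame({"A": arr, "B": arr.copy()}, copy=False)

# Default (copy not passed): dict input is still copied, so the frame
# is insulated from later mutation of the source array.
arr2 = np.array([1, 2, 3])
df2 = pd.DataFrame({"A": arr2})
arr2[0] = 99
print(df2["A"].tolist())  # [1, 2, 3]
```

Note that only the second half (default behavior) is stable across versions; whether ``copy=False`` yields an actual view also depends on the manager backend.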
39 changes: 23 additions & 16 deletions pandas/core/frame.py
@@ -476,8 +476,10 @@ class DataFrame(NDFrame, OpsMixin):
RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer.
copy : bool, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
copy : bool or None, default None
Copy data from inputs.
For dict data, the default of None behaves like ``copy=True``. For DataFrame
or 2d ndarray input, the default of None behaves like ``copy=False``.
Contributor: can you add a versionchanged 1.3


See Also
--------
@@ -555,8 +557,17 @@ def __init__(
index: Optional[Axes] = None,
columns: Optional[Axes] = None,
dtype: Optional[Dtype] = None,
copy: bool = False,
copy: Optional[bool] = None,
Member: Can you update the docstring above for this change?

Member Author: good idea, will do.

):

if copy is None:
# GH#38939
Contributor: remove this line (as you mention below)

if isinstance(data, dict) or data is None:
# retain pre-GH38939 default behavior
copy = True
else:
copy = False

if data is None:
data = {}
if dtype is not None:
@@ -565,18 +576,13 @@
if isinstance(data, DataFrame):
data = data._mgr

# first check if a Manager is passed without any other arguments
# -> use fastpath (without checking Manager type)
if (
index is None
and columns is None
and dtype is None
and copy is False
and isinstance(data, (BlockManager, ArrayManager))
):
# GH#33357 fastpath
NDFrame.__init__(self, data)
return
if isinstance(data, (BlockManager, ArrayManager)):
# first check if a Manager is passed without any other arguments
# -> use fastpath (without checking Manager type)
if index is None and columns is None and dtype is None and not copy:
# GH#33357 fastpath
NDFrame.__init__(self, data)
return

manager = get_option("mode.data_manager")

@@ -586,7 +592,8 @@
)

elif isinstance(data, dict):
mgr = dict_to_mgr(data, index, columns, dtype=dtype, typ=manager)
# GH#38939 de facto copy defaults to False only in non-dict cases
mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

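The default-resolution logic at the top of ``__init__`` can be sketched as a standalone helper (``resolve_copy`` is a hypothetical name; the real code inlines this):

```python
from typing import Any, Optional

def resolve_copy(data: Any, copy: Optional[bool]) -> bool:
    # GH#38939: with copy=None, dict (and None) input keeps the old
    # copying default, while DataFrame/ndarray input defaults to no copy.
    if copy is None:
        return isinstance(data, dict) or data is None
    return copy

print(resolve_copy({"a": [1]}, None))   # True
print(resolve_copy(None, None))         # True
print(resolve_copy([[1, 2]], None))     # False
print(resolve_copy({"a": [1]}, False))  # False
```

This keeps the change backward compatible: only callers who explicitly pass ``copy=False`` with a dict see new behavior.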
4 changes: 3 additions & 1 deletion pandas/core/groupby/groupby.py
@@ -1807,7 +1807,9 @@ def describe(self, **kwargs):
result = self.apply(lambda x: x.describe(**kwargs))
if self.axis == 1:
return result.T
return result.unstack()
# FIXME: not being consolidated breaks
# test_describe_with_duplicate_output_column_names
return result._consolidate().unstack()

@final
def resample(self, rule, *args, **kwargs):
34 changes: 30 additions & 4 deletions pandas/core/internals/construction.py
@@ -101,9 +101,11 @@ def arrays_to_mgr(
arr_names,
index,
columns,
*,
dtype: Optional[DtypeObj] = None,
verify_integrity: bool = True,
typ: Optional[str] = None,
consolidate: bool = True,
) -> Manager:
"""
Segregate Series based on type and coerce into matrices.
@@ -131,7 +133,9 @@
axes = [columns, index]

if typ == "block":
return create_block_manager_from_arrays(arrays, arr_names, axes)
return create_block_manager_from_arrays(
arrays, arr_names, axes, consolidate=consolidate
)
elif typ == "array":
if len(columns) != len(arrays):
assert len(arrays) == 0
@@ -181,7 +185,7 @@ def rec_array_to_mgr(
if columns is None:
columns = arr_columns

mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype, typ=typ)
mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype=dtype, typ=typ)

if copy:
mgr = mgr.copy()
@@ -376,7 +380,13 @@ def maybe_squeeze_dt64tz(dta: ArrayLike) -> ArrayLike:


def dict_to_mgr(
data: Dict, index, columns, dtype: Optional[DtypeObj], typ: str
data: Dict,
index,
columns,
*,
dtype: Optional[DtypeObj] = None,
typ: str = "block",
copy: bool = True,
) -> Manager:
"""
Segregate Series based on type and coerce into matrices.
@@ -414,6 +424,8 @@
val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype)
arrays.loc[missing] = [val] * missing.sum()

arrays = list(arrays)

else:
keys = list(data.keys())
columns = data_names = Index(keys)
@@ -424,7 +436,21 @@
arrays = [
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
]
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ)

if copy:
# arrays_to_mgr (via form_blocks) won't make copies for EAs
# dtype attr check to exclude EADtype-castable strs
arrays = [
x
if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype)
else x.copy()
for x in arrays
]
# TODO: can we get rid of the dt64tz special case above?

return arrays_to_mgr(
arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
)


def nested_data_to_arrays(
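The extension-array copy pass in ``dict_to_mgr`` can be illustrated in isolation. The helper name below is hypothetical; it mirrors the list comprehension in the diff, which copies only ``ExtensionDtype``-backed arrays because block formation already copies plain ndarrays when consolidating:

```python
import numpy as np
import pandas as pd
from pandas.api.extensions import ExtensionDtype

def copy_extension_arrays(arrays):
    # Copy array-likes backed by an ExtensionDtype; leave everything else
    # (plain ndarrays, lists, strings) untouched.
    return [
        x.copy() if hasattr(x, "dtype") and isinstance(x.dtype, ExtensionDtype) else x
        for x in arrays
    ]

ea = pd.array([1, 2, None], dtype="Int64")   # ExtensionDtype-backed
nd = np.array([1.0, 2.0, 3.0])               # plain ndarray
out = copy_extension_arrays([ea, nd])
print(out[0] is ea, out[1] is nd)  # False True
```

The ``hasattr(x, "dtype")`` guard matters because dict values may be raw lists or strings castable to an extension dtype, which have no ``dtype`` attribute yet.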
65 changes: 51 additions & 14 deletions pandas/core/internals/managers.py
@@ -53,7 +53,10 @@

import pandas.core.algorithms as algos
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import extract_array
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
Float64Index,
@@ -991,6 +994,8 @@ def fast_xs(self, loc: int) -> ArrayLike:
# Any]]"
result = np.empty(n, dtype=dtype) # type: ignore[arg-type]

result = ensure_wrapped_if_datetimelike(result)

for blk in self.blocks:
# Such assignment may incorrectly coerce NaT to None
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
@@ -1715,7 +1720,7 @@ def set_values(self, values: ArrayLike):


def create_block_manager_from_blocks(
blocks: List[Block], axes: List[Index]
blocks: List[Block], axes: List[Index], consolidate: bool = True
) -> BlockManager:
try:
mgr = BlockManager(blocks, axes)
@@ -1725,7 +1730,8 @@
tot_items = sum(arr.shape[0] for arr in arrays)
raise construction_error(tot_items, arrays[0].shape[1:], axes, err)

mgr._consolidate_inplace()
if consolidate:
mgr._consolidate_inplace()
return mgr


@@ -1735,7 +1741,10 @@ def _extract_array(obj):


def create_block_manager_from_arrays(
arrays, names: Index, axes: List[Index]
arrays,
names: Index,
axes: List[Index],
consolidate: bool = True,
) -> BlockManager:
assert isinstance(names, Index)
assert isinstance(axes, list)
Expand All @@ -1744,12 +1753,13 @@ def create_block_manager_from_arrays(
arrays = [_extract_array(x) for x in arrays]

try:
blocks = _form_blocks(arrays, names, axes)
blocks = _form_blocks(arrays, names, axes, consolidate)
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
raise construction_error(len(arrays), arrays[0].shape, axes, e)
if consolidate:
mgr._consolidate_inplace()
return mgr


def construction_error(
Expand Down Expand Up @@ -1782,7 +1792,7 @@ def construction_error(


def _form_blocks(
arrays: List[ArrayLike], names: Index, axes: List[Index]
arrays: List[ArrayLike], names: Index, axes: List[Index], consolidate: bool
) -> List[Block]:
# put "leftover" items in float bucket, where else?
# generalize?
Expand All @@ -1808,15 +1818,21 @@ def _form_blocks(

blocks: List[Block] = []
if len(items_dict["NumericBlock"]):
numeric_blocks = _multi_blockify(items_dict["NumericBlock"])
numeric_blocks = _multi_blockify(
items_dict["NumericBlock"], consolidate=consolidate
)
blocks.extend(numeric_blocks)

if len(items_dict["TimeDeltaBlock"]):
timedelta_blocks = _multi_blockify(items_dict["TimeDeltaBlock"])
timedelta_blocks = _multi_blockify(
items_dict["TimeDeltaBlock"], consolidate=consolidate
)
blocks.extend(timedelta_blocks)

if len(items_dict["DatetimeBlock"]):
datetime_blocks = _simple_blockify(items_dict["DatetimeBlock"], DT64NS_DTYPE)
datetime_blocks = _simple_blockify(
items_dict["DatetimeBlock"], DT64NS_DTYPE, consolidate=consolidate
)
blocks.extend(datetime_blocks)

if len(items_dict["DatetimeTZBlock"]):
Expand All @@ -1827,7 +1843,9 @@ def _form_blocks(
blocks.extend(dttz_blocks)

if len(items_dict["ObjectBlock"]) > 0:
object_blocks = _simple_blockify(items_dict["ObjectBlock"], np.object_)
object_blocks = _simple_blockify(
items_dict["ObjectBlock"], np.object_, consolidate=consolidate
)
blocks.extend(object_blocks)

if len(items_dict["CategoricalBlock"]) > 0:
Expand Down Expand Up @@ -1866,11 +1884,14 @@ def _form_blocks(
return blocks


def _simple_blockify(tuples, dtype) -> List[Block]:
def _simple_blockify(tuples, dtype, consolidate: bool) -> List[Block]:
"""
return a single array of a block that has a single dtype; if dtype is
not None, coerce to this dtype
"""
if not consolidate:
return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)

values, placement = _stack_arrays(tuples, dtype)

# TODO: CHECK DTYPE?
Expand All @@ -1881,8 +1902,12 @@ def _simple_blockify(tuples, dtype) -> List[Block]:
return [block]


def _multi_blockify(tuples, dtype: Optional[Dtype] = None):
def _multi_blockify(tuples, dtype: Optional[DtypeObj] = None, consolidate: bool = True):
""" return an array of blocks that potentially have different dtypes """

if not consolidate:
return _tuples_to_blocks_no_consolidate(tuples, dtype=dtype)

# group by dtype
grouper = itertools.groupby(tuples, lambda x: x[1].dtype)

Expand All @@ -1902,6 +1927,18 @@ def _multi_blockify(tuples, dtype: Optional[Dtype] = None):
return new_blocks


def _tuples_to_blocks_no_consolidate(tuples, dtype: Optional[DtypeObj]) -> List[Block]:
# tuples produced within _form_blocks are of the form (placement, whatever, array)
if dtype is not None:
return [
new_block(
np.atleast_2d(x[1].astype(dtype, copy=False)), placement=x[0], ndim=2
)
for x in tuples
]
return [new_block(np.atleast_2d(x[1]), placement=x[0], ndim=2) for x in tuples]


def _stack_arrays(tuples, dtype: np.dtype):

placement, arrays = zip(*tuples)
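The effect of the new ``consolidate`` plumbing can be observed through the internal block count. This is internal API shown purely for illustration (assuming the default BlockManager backend on pandas 1.3+; exact counts are an implementation detail):

```python
import numpy as np
import pandas as pd

arr = np.arange(4, dtype=np.int64)
df_copy = pd.DataFrame({"a": arr, "b": arr.copy()})              # copied and consolidated
df_view = pd.DataFrame({"a": arr, "b": arr.copy()}, copy=False)  # neither

# A consolidated frame packs same-dtype columns into one 2D block;
# copy=False skips consolidation, keeping one block per input array.
print(df_copy._mgr.nblocks, df_view._mgr.nblocks)

# The user-visible data is identical either way.
print(df_copy.equals(df_view))  # True
```

This is why ``arrays_to_mgr`` threads ``consolidate=copy`` through to ``create_block_manager_from_arrays``: not copying and not consolidating go together, since consolidation itself allocates a new combined array.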
10 changes: 9 additions & 1 deletion pandas/tests/arithmetic/test_numeric.py
@@ -538,7 +538,6 @@ def test_df_div_zero_series_does_not_commute(self):
def test_df_mod_zero_df(self, using_array_manager):
# GH#3590, modulo as ints
df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})

# this is technically wrong, as the integer portion is coerced to float
first = Series([0, 0, 0, 0])
if not using_array_manager:
@@ -551,6 +550,15 @@ def test_df_mod_zero_df(self, using_array_manager):
result = df % df
tm.assert_frame_equal(result, expected)

# GH#38939 If we dont pass copy=False, df is consolidated and
# result["first"] is float64 instead of int64
df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False)
first = Series([0, 0, 0, 0], dtype="int64")
second = Series([np.nan, np.nan, np.nan, 0])
expected = pd.DataFrame({"first": first, "second": second})
result = df % df
tm.assert_frame_equal(result, expected)

def test_df_mod_zero_array(self):
# GH#3590, modulo as ints
df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
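The dtype effect the new test asserts can be reproduced directly (assuming pandas 1.3+ with the BlockManager backend):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False)
result = df % df

# "first" has no zero divisors, so with unconsolidated blocks it can stay
# integer; "second" produces NaN for the x % 0 entries and becomes float.
print(result["first"].tolist())   # [0, 0, 0, 0]
print(result["second"].tolist())
```

With the default (copying, consolidating) constructor, both int64 columns share one block, so the NaN in ``second`` forces the whole block, including ``first``, to float64.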
2 changes: 1 addition & 1 deletion pandas/tests/extension/decimal/array.py
@@ -150,7 +150,7 @@ def take(self, indexer, allow_fill=False, fill_value=None):
return self._from_sequence(result)

def copy(self):
return type(self)(self._data.copy())
return type(self)(self._data.copy(), dtype=self.dtype)

def astype(self, dtype, copy=True):
if is_dtype_equal(dtype, self._dtype):
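The one-line fix to the test extension array above matters because ``type(self)(...)`` without ``dtype`` rebuilds the array with a default dtype, silently dropping any per-instance dtype state. A minimal sketch of the pattern with hypothetical stand-in classes:

```python
class Dtype:
    # Stand-in for an ExtensionDtype carrying per-instance state
    # (e.g. a decimal context).
    def __init__(self, context):
        self.context = context

class MiniArray:
    # Stand-in for the DecimalArray pattern: copy() must forward the
    # dtype, or the copy silently reverts to the default context.
    def __init__(self, data, dtype=None):
        self._data = list(data)
        self.dtype = dtype if dtype is not None else Dtype(context="default")

    def copy(self):
        return type(self)(self._data.copy(), dtype=self.dtype)

arr = MiniArray([1, 2], dtype=Dtype(context="custom"))
print(arr.copy().dtype.context)  # custom
```

The PR exercises ``copy()`` on extension arrays far more often (every ``copy=True`` dict construction), which is presumably why this latent test-fixture bug surfaced here.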