Skip to content

Commit

Permalink
Backport PR #52022 on branch 2.0.x (API / CoW: Copy arrays by default…
Browse files Browse the repository at this point in the history
… in Series constructor) (#52282)

API / CoW: Copy arrays by default in Series constructor (#52022)
  • Loading branch information
phofl committed Mar 29, 2023
1 parent 5508e73 commit bca610a
Show file tree
Hide file tree
Showing 14 changed files with 64 additions and 21 deletions.
7 changes: 4 additions & 3 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,13 @@ Copy-on-Write improvements
- The :class:`DataFrame` constructor, when constructing a DataFrame from a
:class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write.

- The :class:`DataFrame` constructor, when constructing from a NumPy array,
will now copy the array by default to avoid mutating the :class:`DataFrame`
- The :class:`DataFrame` and :class:`Series` constructors, when constructing from
a NumPy array, will now copy the array by default to avoid mutating
the :class:`DataFrame` / :class:`Series`
when mutating the array. Specify ``copy=False`` to get the old behavior.
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
behavior when the NumPy array is modified after creation of the
:class:`DataFrame`.
:class:`DataFrame` / :class:`Series`.

- The :meth:`DataFrame.from_records` will now respect Copy-on-Write when called
with a :class:`DataFrame`.
Expand Down
2 changes: 1 addition & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ def _create_series(index):
"""Helper for the _series dict"""
size = len(index)
data = np.random.randn(size)
return Series(data, index=index, name="a")
return Series(data, index=index, name="a", copy=False)


_series = {
Expand Down
12 changes: 10 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
validate_percentile,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.cast import (
LossySetitemError,
convert_dtypes,
Expand Down Expand Up @@ -370,14 +371,14 @@ def __init__(
index=None,
dtype: Dtype | None = None,
name=None,
copy: bool = False,
copy: bool | None = None,
fastpath: bool = False,
) -> None:
if (
isinstance(data, (SingleBlockManager, SingleArrayManager))
and index is None
and dtype is None
and copy is False
and (copy is False or copy is None)
):
if using_copy_on_write():
data = data.copy(deep=False)
Expand All @@ -390,6 +391,13 @@ def __init__(
self.name = name
return

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy is not False and using_copy_on_write():
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
data = data.copy()
if copy is None:
copy = False

# we are called internally, so short-circuit
if fastpath:
# data is a ndarray, index is defined
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/arrays/categorical/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
# GH#26988
cat = Categorical(["a", "b"])
expected = Categorical(result)
result = pd.Series(cat).replace(to_replace, value)._values
result = pd.Series(cat, copy=False).replace(to_replace, value)._values

tm.assert_categorical_equal(result, expected)
if to_replace == "b": # the "c" test is supposed to be unchanged
with pytest.raises(AssertionError, match=expected_error_msg):
# ensure non-inplace call does not affect original
tm.assert_categorical_equal(cat, expected)

pd.Series(cat).replace(to_replace, value, inplace=True)
pd.Series(cat, copy=False).replace(to_replace, value, inplace=True)
tm.assert_categorical_equal(cat, expected)


Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,9 @@ def test_astype_arrow_timestamp(using_copy_on_write):
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)
# TODO(CoW): arrow is not setting copy=False in the Series constructor
# under the hood
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)


def test_convert_dtypes_infer_objects(using_copy_on_write):
Expand Down
38 changes: 35 additions & 3 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_series_from_series(dtype, using_copy_on_write):
result = Series(ser, dtype=dtype)

# the shallow copy still shares memory
assert np.shares_memory(ser.values, result.values)
assert np.shares_memory(get_array(ser), get_array(result))

if using_copy_on_write:
assert result._mgr.blocks[0].refs.has_reference()
Expand All @@ -40,13 +40,13 @@ def test_series_from_series(dtype, using_copy_on_write):
result.iloc[0] = 0
assert ser.iloc[0] == 1
# mutating triggered a copy-on-write -> no longer shares memory
assert not np.shares_memory(ser.values, result.values)
assert not np.shares_memory(get_array(ser), get_array(result))
else:
# mutating shallow copy does mutate original
result.iloc[0] = 0
assert ser.iloc[0] == 0
# and still shares memory
assert np.shares_memory(ser.values, result.values)
assert np.shares_memory(get_array(ser), get_array(result))

# the same when modifying the parent
result = Series(ser, dtype=dtype)
Expand Down Expand Up @@ -90,6 +90,38 @@ def test_series_from_series_with_reindex(using_copy_on_write):
assert not result._mgr.blocks[0].refs.has_reference()


@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
"arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
if idx is None or dtype is not None:
fastpath = False
ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
ser_orig = ser.copy()
data = getattr(arr, "_data", arr)
if using_copy_on_write:
assert not np.shares_memory(get_array(ser), data)
else:
assert np.shares_memory(get_array(ser), data)

arr[0] = 100
if using_copy_on_write:
tm.assert_series_equal(ser, ser_orig)
else:
expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
tm.assert_series_equal(ser, expected)


@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(using_copy_on_write, copy):
arr = np.array([1, 2, 3], dtype="int64")
ser = Series(arr, dtype="int32", copy=copy)
assert not np.shares_memory(get_array(ser), arr)


@pytest.mark.parametrize(
"idx",
[
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_array_from_scalars(self, data):
assert isinstance(result, type(data))

def test_series_constructor(self, data):
result = pd.Series(data)
result = pd.Series(data, copy=False)
assert result.dtype == data.dtype
assert len(result) == len(data)
if hasattr(result._mgr, "blocks"):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def test_fillna_copy_frame(self, data_missing):

def test_fillna_copy_series(self, data_missing):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser = pd.Series(arr, copy=False)
ser_orig = ser.copy()

filled_val = ser[0]
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write):

def test_fillna_copy_series(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser = pd.Series(arr, copy=False)

filled_val = ser[0]
result = ser.fillna(filled_val)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1362,7 +1362,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool):

def check_series_setitem(self, elem, index: Index, inplace: bool):
arr = index._data.copy()
ser = Series(arr)
ser = Series(arr, copy=False)

self.check_can_hold_element(ser, elem, inplace)

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -2812,7 +2812,7 @@ def __getitem__(self, ix):
def dtype(self):
return DtypeStub()

series = Series(ExtTypeStub())
series = Series(ExtTypeStub(), copy=False)
res = repr(series) # This line crashed before #33770 was fixed.
expected = "0 [False True]\n" + "1 [ True False]\n" + "dtype: DtypeStub"
assert res == expected
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/reductions/test_stat_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def test_td64_mean(self, box):
tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")

tdarr = tdi._data
obj = box(tdarr)
obj = box(tdarr, copy=False)

result = obj.mean()
expected = np.array(tdarr).mean()
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ def test_setitem_scalar_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

for n in series.index:
msg = "assignment destination is read-only"
Expand All @@ -593,7 +593,7 @@ def test_setitem_slice_into_readonly_backing_data():

array = np.zeros(5)
array.flags.writeable = False # make the array immutable
series = Series(array)
series = Series(array, copy=False)

msg = "assignment destination is read-only"
with pytest.raises(ValueError, match=msg):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self):
# however, copy is False by default
# so this WILL change values
cat = Categorical(["a", "b", "c", "a"])
s = Series(cat)
s = Series(cat, copy=False)
assert s.values is cat
s = s.cat.rename_categories([1, 2, 3])
assert s.values is not cat
Expand Down

0 comments on commit bca610a

Please sign in to comment.