Skip to content

Commit

Permalink
Backport PR #52008 on branch 2.0.x (CoW: Avoid copying Index in Serie…
Browse files Browse the repository at this point in the history
…s constructor) (#52048)

CoW: Avoid copying Index in Series constructor (#52008)
  • Loading branch information
phofl committed Mar 17, 2023
1 parent a498448 commit 065a320
Show file tree
Hide file tree
Showing 14 changed files with 108 additions and 30 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1212,6 +1212,7 @@ Conversion
- Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`)
- Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`)
- Bug in :meth:`Series` not copying data when created from :class:`Index` and ``dtype`` is equal to ``dtype`` from :class:`Index` (:issue:`52008`)

Strings
^^^^^^^
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/internals.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ class BlockManager:

class BlockValuesRefs:
referenced_blocks: list[weakref.ref]
def __init__(self, blk: SharedBlock) -> None: ...
def __init__(self, blk: SharedBlock | None = ...) -> None: ...
def add_reference(self, blk: SharedBlock) -> None: ...
def add_index_reference(self, index: object) -> None: ...
def has_reference(self) -> bool: ...
7 changes: 5 additions & 2 deletions pandas/_libs/internals.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -877,8 +877,11 @@ cdef class BlockValuesRefs:
cdef:
public list referenced_blocks

def __cinit__(self, blk: SharedBlock) -> None:
self.referenced_blocks = [weakref.ref(blk)]
def __cinit__(self, blk: SharedBlock | None = None) -> None:
if blk is not None:
self.referenced_blocks = [weakref.ref(blk)]
else:
self.referenced_blocks = []

def add_reference(self, blk: SharedBlock) -> None:
"""Adds a new reference to our reference collection.
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
index as libindex,
lib,
)
from pandas._libs.internals import BlockValuesRefs
import pandas._libs.join as libjoin
from pandas._libs.lib import (
is_datetime_array,
Expand Down Expand Up @@ -653,9 +654,11 @@ def _simple_new(
result._name = name
result._cache = {}
result._reset_identity()
result._references = refs
if refs is not None:
refs.add_index_reference(result)
result._references = refs
else:
result._references = BlockValuesRefs()
result._references.add_index_reference(result)

return result

Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,8 @@ def inferred_type(self) -> str:
"""Return a string of the type inferred from the values"""
return "interval"

@Appender(Index.memory_usage.__doc__)
# Cannot determine type of "memory_usage"
@Appender(Index.memory_usage.__doc__) # type: ignore[has-type]
def memory_usage(self, deep: bool = False) -> int:
# we don't use an explicit engine
# so return the bytes here
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1233,7 +1233,8 @@ def f(level) -> bool:

return any(f(level) for level in self._inferred_type_levels)

@doc(Index.memory_usage)
# Cannot determine type of "memory_usage"
@doc(Index.memory_usage) # type: ignore[has-type]
def memory_usage(self, deep: bool = False) -> int:
# we are overwriting our base class to avoid
# computing .values here which could materialize
Expand Down
11 changes: 8 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@
internals as libinternals,
lib,
)
from pandas._libs.internals import BlockPlacement
from pandas._libs.internals import (
BlockPlacement,
BlockValuesRefs,
)
from pandas._typing import (
ArrayLike,
AxisInt,
Expand Down Expand Up @@ -1868,11 +1871,13 @@ def from_blocks(
return cls(blocks[0], axes[0], verify_integrity=False)

@classmethod
def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager:
def from_array(
cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
) -> SingleBlockManager:
"""
Constructor for if we have an array that is not yet a Block.
"""
block = new_block(array, placement=slice(0, len(index)), ndim=1)
block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs)
return cls(block, index)

def to_2d_mgr(self, columns: Index) -> BlockManager:
Expand Down
11 changes: 8 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,10 +426,15 @@ def __init__(
raise NotImplementedError(
"initializing a Series from a MultiIndex is not supported"
)

refs = None
if isinstance(data, Index):
if dtype is not None:
# astype copies
data = data.astype(dtype)
data = data.astype(dtype, copy=False)

if using_copy_on_write():
refs = data._references
data = data._values
else:
# GH#24096 we need to ensure the index remains immutable
data = data._values.copy()
Expand Down Expand Up @@ -493,7 +498,7 @@ def __init__(

manager = get_option("mode.data_manager")
if manager == "block":
data = SingleBlockManager.from_array(data, index)
data = SingleBlockManager.from_array(data, index, refs=refs)
elif manager == "array":
data = SingleArrayManager.from_array(data, index)

Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def test_astype_different_timezones(using_copy_on_write):
result = df.astype("datetime64[ns, Europe/Berlin]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8)
assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))


def test_astype_different_timezones_different_reso(using_copy_on_write):
Expand All @@ -183,9 +183,7 @@ def test_astype_different_timezones_different_reso(using_copy_on_write):
result = df.astype("datetime64[ms, Europe/Berlin]")
if using_copy_on_write:
assert result._mgr._has_no_reference(0)
assert not np.shares_memory(
get_array(df, "a").asi8, get_array(result, "a").asi8
)
assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))


@pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed")
Expand All @@ -202,7 +200,7 @@ def test_astype_arrow_timestamp(using_copy_on_write):
result = df.astype("timestamp[ns][pyarrow]")
if using_copy_on_write:
assert not result._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data)
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data)


def test_convert_dtypes_infer_objects(using_copy_on_write):
Expand Down
36 changes: 36 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@

from pandas import (
DataFrame,
DatetimeIndex,
Index,
Period,
PeriodIndex,
Series,
Timedelta,
TimedeltaIndex,
Timestamp,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array
Expand Down Expand Up @@ -82,6 +89,35 @@ def test_series_from_series_with_reindex(using_copy_on_write):
assert not result._mgr.blocks[0].refs.has_reference()


@pytest.mark.parametrize(
"idx",
[
Index([1, 2]),
DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
],
)
def test_series_from_index(using_copy_on_write, idx):
ser = Series(idx)
expected = idx.copy(deep=True)
if using_copy_on_write:
assert np.shares_memory(get_array(ser), get_array(idx))
assert not ser._mgr._has_no_reference(0)
else:
assert not np.shares_memory(get_array(ser), get_array(idx))
ser.iloc[0] = ser.iloc[1]
tm.assert_index_equal(idx, expected)


def test_series_from_index_different_dtypes(using_copy_on_write):
idx = Index([1, 2, 3], dtype="int64")
ser = Series(idx, dtype="int32")
assert not np.shares_memory(get_array(ser), get_array(idx))
if using_copy_on_write:
assert ser._mgr._has_no_reference(0)


@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/copy_view/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from pandas import Series
from pandas import (
Categorical,
Index,
Series,
)
from pandas.core.arrays import BaseMaskedArray


Expand All @@ -10,7 +14,9 @@ def get_array(obj, col=None):
which triggers tracking references / CoW (and we might be testing that
this is done by some other operation).
"""
if isinstance(obj, Series) and (col is None or obj.name == col):
if isinstance(obj, Index):
arr = obj._values
elif isinstance(obj, Series) and (col is None or obj.name == col):
arr = obj._values
else:
assert col is not None
Expand All @@ -19,4 +25,6 @@ def get_array(obj, col=None):
arr = obj._get_column_array(icol)
if isinstance(arr, BaseMaskedArray):
return arr._data
return arr
elif isinstance(arr, Categorical):
return arr
return getattr(arr, "_ndarray", arr)
8 changes: 4 additions & 4 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -849,7 +849,7 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
tz = tz_naive_fixture

dti = date_range("2016-01-01", periods=3, tz=tz)
ser = Series(dti)
ser = Series(dti.copy(deep=True))

values = ser._values

Expand Down Expand Up @@ -877,7 +877,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
key = slice(0, 1)

dti = date_range("2016-01-01", periods=3, tz=tz)
ser = Series(dti)
ser = Series(dti.copy(deep=True))

values = ser._values

Expand All @@ -897,7 +897,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo
def test_setitem_td64_scalar(self, indexer_sli, scalar):
# dispatching _can_hold_element to underling TimedeltaArray
tdi = timedelta_range("1 Day", periods=3)
ser = Series(tdi)
ser = Series(tdi.copy(deep=True))

values = ser._values
values._validate_setitem_value(scalar)
Expand All @@ -915,7 +915,7 @@ def test_setitem_td64_string_values(self, indexer_sli, key, box):
key = slice(0, 1)

tdi = timedelta_range("1 Day", periods=3)
ser = Series(tdi)
ser = Series(tdi.copy(deep=True))

values = ser._values

Expand Down
18 changes: 13 additions & 5 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,15 +404,18 @@ def test_setitem_mask_smallint_no_upcast(self):


class TestSetitemViewCopySemantics:
def test_setitem_invalidates_datetime_index_freq(self):
def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write):
# GH#24096 altering a datetime64tz Series inplace invalidates the
# `freq` attribute on the underlying DatetimeIndex

dti = date_range("20130101", periods=3, tz="US/Eastern")
ts = dti[1]
ser = Series(dti)
assert ser._values is not dti
assert ser._values._ndarray.base is not dti._data._ndarray.base
if using_copy_on_write:
assert ser._values._ndarray.base is dti._data._ndarray.base
else:
assert ser._values._ndarray.base is not dti._data._ndarray.base
assert dti.freq == "D"
ser.iloc[1] = NaT
assert ser._values.freq is None
Expand All @@ -423,15 +426,20 @@ def test_setitem_invalidates_datetime_index_freq(self):
assert dti[1] == ts
assert dti.freq == "D"

def test_dt64tz_setitem_does_not_mutate_dti(self):
def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write):
# GH#21907, GH#24096
dti = date_range("2016-01-01", periods=10, tz="US/Pacific")
ts = dti[0]
ser = Series(dti)
assert ser._values is not dti
assert ser._values._ndarray.base is not dti._data._ndarray.base
if using_copy_on_write:
assert ser._values._ndarray.base is dti._data._ndarray.base
assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base
else:
assert ser._values._ndarray.base is not dti._data._ndarray.base
assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base

assert ser._mgr.arrays[0] is not dti
assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base

ser[::3] = NaT
assert ser[0] is NaT
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2056,6 +2056,14 @@ def test_series_constructor_ea_all_na(self):
)
tm.assert_series_equal(result, expected)

def test_series_from_index_dtype_equal_does_not_copy(self):
# GH#52008
idx = Index([1, 2, 3])
expected = idx.copy(deep=True)
ser = Series(idx, dtype="int64")
ser.iloc[0] = 100
tm.assert_index_equal(idx, expected)


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down

0 comments on commit 065a320

Please sign in to comment.