Skip to content

Commit

Permalink
Backport PR #51082 on branch 2.0.x (API / CoW: return read-only numpy…
Browse files Browse the repository at this point in the history
… arrays in .values/to_numpy()) (#51933)

Backport PR #51082: API / CoW: return read-only numpy arrays in .values/to_numpy()

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
meeseeksmachine and jorisvandenbossche committed Mar 13, 2023
1 parent 0bbc9f5 commit 68b409b
Show file tree
Hide file tree
Showing 26 changed files with 274 additions and 53 deletions.
12 changes: 10 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import lib
from pandas._typing import (
Axis,
Expand Down Expand Up @@ -545,10 +547,16 @@ def to_numpy(

result = np.asarray(values, dtype=dtype)

if copy and na_value is lib.no_default:
if (copy and na_value is lib.no_default) or (
not copy and using_copy_on_write()
):
if np.shares_memory(self._values[:2], result[:2]):
# Take slices to improve performance of check
result = result.copy()
if using_copy_on_write() and not copy:
result = result.view()
result.flags.writeable = False
else:
result = result.copy()

return result

Expand Down
8 changes: 7 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1989,7 +1989,13 @@ def empty(self) -> bool_t:
__array_priority__: int = 1000

def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
return np.asarray(self._values, dtype=dtype)
values = self._values
arr = np.asarray(values, dtype=dtype)
if arr is values and using_copy_on_write():
# TODO(CoW) also properly handle extension dtypes
arr = arr.view()
arr.flags.writeable = False
return arr

@final
def __array_ufunc__(
Expand Down
14 changes: 11 additions & 3 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

import numpy as np

from pandas._config import using_copy_on_write

from pandas._libs import (
internals as libinternals,
lib,
Expand Down Expand Up @@ -2592,6 +2594,12 @@ def external_values(values: ArrayLike) -> ArrayLike:
# NB: for datetime64tz this is different from np.asarray(values), since
# that returns an object-dtype ndarray of Timestamps.
# Avoid raising in .astype in casting from dt64tz to dt64
return values._ndarray
else:
return values
values = values._ndarray

if isinstance(values, np.ndarray) and using_copy_on_write():
values = values.view()
values.flags.writeable = False

# TODO(CoW) we should also mark our ExtensionArrays as read-only

return values
13 changes: 8 additions & 5 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1719,13 +1719,16 @@ def as_array(
arr = np.asarray(blk.get_values())
if dtype:
arr = arr.astype(dtype, copy=False)

if copy:
arr = arr.copy()
elif using_copy_on_write():
arr = arr.view()
arr.flags.writeable = False
else:
arr = self._interleave(dtype=dtype, na_value=na_value)
# The underlying data was copied within _interleave
copy = False

if copy:
arr = arr.copy()
# The underlying data was copied within _interleave, so no need
# to further copy if copy=True or setting na_value

if na_value is not lib.no_default:
arr[isna(arr)] = na_value
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,7 +897,13 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
array(['1999-12-31T23:00:00.000000000', ...],
dtype='datetime64[ns]')
"""
return np.asarray(self._values, dtype)
values = self._values
arr = np.asarray(values, dtype=dtype)
if arr is values and using_copy_on_write():
# TODO(CoW) also properly handle extension dtypes
arr = arr.view()
arr.flags.writeable = False
return arr

# ----------------------------------------------------------------------
# Unary Methods
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,7 +1125,7 @@ def converter(*date_cols, col: Hashable):
dayfirst=dayfirst,
errors="ignore",
cache=cache_dates,
).to_numpy()
)._values
else:
try:
result = tools.to_datetime(
Expand Down
112 changes: 112 additions & 0 deletions pandas/tests/copy_view/test_array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import numpy as np
import pytest

from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array

# -----------------------------------------------------------------------------
# Copy/view behaviour for accessing underlying array of Series/DataFrame


@pytest.mark.parametrize(
    "method",
    [lambda obj: obj.values, lambda obj: np.asarray(obj)],
    ids=["values", "asarray"],
)
def test_series_values(using_copy_on_write, method):
    """Check the array obtained from a Series via ``.values`` / ``np.asarray``.

    With ``using_copy_on_write`` the array is expected to share memory with
    the Series but be read-only; otherwise it is expected to be writeable,
    with writes showing up in the Series.
    """
    ser = Series([1, 2, 3], name="name")
    ser_orig = ser.copy()

    arr = method(ser)

    if not using_copy_on_write:
        # the array is writeable and a write is visible through the Series
        assert arr.flags.writeable is True
        arr[0] = 0
        assert ser.iloc[0] == 0
        return

    # the array is still a view on the Series' data, but flagged read-only
    assert np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is False

    # writing through the array must raise and leave the Series untouched
    with pytest.raises(ValueError, match="read-only"):
        arr[0] = 0
    tm.assert_series_equal(ser, ser_orig)

    # the Series itself can still be modified through its own API
    ser.iloc[0] = 0
    assert ser.values[0] == 0


@pytest.mark.parametrize(
    "method",
    [lambda obj: obj.values, lambda obj: np.asarray(obj)],
    ids=["values", "asarray"],
)
def test_dataframe_values(using_copy_on_write, using_array_manager, method):
    """Check the array obtained from a DataFrame via ``.values`` / ``np.asarray``.

    With ``using_copy_on_write`` the array is expected to share memory with
    the frame but be read-only. Otherwise it is expected to be writeable;
    whether a write shows up in the frame depends on ``using_array_manager``.
    """
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    df_orig = df.copy()

    arr = method(df)

    if not using_copy_on_write:
        assert arr.flags.writeable is True
        arr[0, 0] = 0
        if using_array_manager:
            # with the array manager the write does not reach the frame
            tm.assert_frame_equal(df, df_orig)
        else:
            # otherwise the write is visible through the frame
            assert df.iloc[0, 0] == 0
        return

    # the array is still a view on the frame's data, but flagged read-only
    assert np.shares_memory(arr, get_array(df, "a"))
    assert arr.flags.writeable is False

    # writing through the array must raise and leave the DataFrame untouched
    with pytest.raises(ValueError, match="read-only"):
        arr[0, 0] = 0
    tm.assert_frame_equal(df, df_orig)

    # the DataFrame itself can still be modified through its own API
    df.iloc[0, 0] = 0
    assert df.values[0, 0] == 0


def test_series_to_numpy(using_copy_on_write):
    """Check writeability and memory sharing of ``Series.to_numpy()`` results.

    Covers three calls: the default call (no arguments), ``copy=True`` and a
    ``dtype`` that differs from the Series' own dtype.
    """
    ser = Series([1, 2, 3], name="name")
    ser_orig = ser.copy()

    # default call: no copy requested, no dtype, no NA replacement
    arr = ser.to_numpy()
    if not using_copy_on_write:
        # the array is writeable and a write is visible through the Series
        assert arr.flags.writeable is True
        arr[0] = 0
        assert ser.iloc[0] == 0
    else:
        # the array is still a view on the Series' data, but flagged read-only
        assert np.shares_memory(arr, get_array(ser, "name"))
        assert arr.flags.writeable is False

        # writing through the array must raise and leave the Series untouched
        with pytest.raises(ValueError, match="read-only"):
            arr[0] = 0
        tm.assert_series_equal(ser, ser_orig)

        # the Series itself can still be modified through its own API
        ser.iloc[0] = 0
        assert ser.values[0] == 0

    # explicitly requesting copy=True gives an independent, writeable array
    ser = Series([1, 2, 3], name="name")
    arr = ser.to_numpy(copy=True)
    assert not np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is True

    # a dtype conversion that already produces new memory is writeable as well
    ser = Series([1, 2, 3], name="name")
    arr = ser.to_numpy(dtype="float64")
    assert not np.shares_memory(arr, get_array(ser, "name"))
    assert arr.flags.writeable is True
20 changes: 13 additions & 7 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def test_setitem2(self):

def test_setitem_boolean(self, float_frame):
df = float_frame.copy()
values = float_frame.values
values = float_frame.values.copy()

df[df["A"] > 0] = 4
values[values[:, 0] > 0] = 4
Expand Down Expand Up @@ -381,16 +381,18 @@ def test_setitem_boolean(self, float_frame):
df[df * 0] = 2

# index with DataFrame
df_orig = df.copy()
mask = df > np.abs(df)
expected = df.copy()
df[df > np.abs(df)] = np.nan
expected.values[mask.values] = np.nan
values = df_orig.values.copy()
values[mask.values] = np.nan
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
tm.assert_frame_equal(df, expected)

# set from DataFrame
expected = df.copy()
df[df > np.abs(df)] = df * 2
np.putmask(expected.values, mask.values, df.values * 2)
np.putmask(values, mask.values, df.values * 2)
expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
tm.assert_frame_equal(df, expected)

def test_setitem_cast(self, float_frame):
Expand Down Expand Up @@ -664,16 +666,20 @@ def test_setitem_fancy_boolean(self, float_frame):
# from 2d, set with booleans
frame = float_frame.copy()
expected = float_frame.copy()
values = expected.values.copy()

mask = frame["A"] > 0
frame.loc[mask] = 0.0
expected.values[mask.values] = 0.0
values[mask.values] = 0.0
expected = DataFrame(values, index=expected.index, columns=expected.columns)
tm.assert_frame_equal(frame, expected)

frame = float_frame.copy()
expected = float_frame.copy()
values = expected.values.copy()
frame.loc[mask, ["A", "B"]] = 0.0
expected.values[mask.values, :2] = 0.0
values[mask.values, :2] = 0.0
expected = DataFrame(values, index=expected.index, columns=expected.columns)
tm.assert_frame_equal(frame, expected)

def test_getitem_fancy_ints(self, float_frame):
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/frame/indexing/test_insert.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def test_insert_with_columns_dups(self):
)
tm.assert_frame_equal(df, exp)

def test_insert_item_cache(self, using_array_manager):
def test_insert_item_cache(self, using_array_manager, using_copy_on_write):
df = DataFrame(np.random.randn(4, 3))
ser = df[0]

Expand All @@ -85,9 +85,14 @@ def test_insert_item_cache(self, using_array_manager):
for n in range(100):
df[n + 3] = df[1] * n

ser.values[0] = 99

assert df.iloc[0, 0] == df[0][0]
if using_copy_on_write:
ser.iloc[0] = 99
assert df.iloc[0, 0] == df[0][0]
assert df.iloc[0, 0] != 99
else:
ser.values[0] = 99
assert df.iloc[0, 0] == df[0][0]
assert df.iloc[0, 0] == 99

def test_insert_EA_no_warning(self):
# PerformanceWarning about fragmented frame should not be raised when
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1002,8 +1002,9 @@ def test_setitem_boolean_mask(self, mask_type, float_frame):
result = df.copy()
result[mask] = np.nan

expected = df.copy()
expected.values[np.array(mask)] = np.nan
expected = df.values.copy()
expected[np.array(mask)] = np.nan
expected = DataFrame(expected, index=df.index, columns=df.columns)
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(reason="Currently empty indexers are treated as all False")
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,7 +982,7 @@ def test_where_dt64_2d():

df = DataFrame(dta, columns=["A", "B"])

mask = np.asarray(df.isna())
mask = np.asarray(df.isna()).copy()
mask[:, 1] = True

# setting all of one column, none of the other
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/frame/methods/test_copy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def test_copy_index_name_checking(self, float_frame, attr):
getattr(cp, attr).name = "foo"
assert getattr(float_frame, attr).name is None

@td.skip_copy_on_write_invalid_test
def test_copy_cache(self):
# GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
df = DataFrame({"a": [1]})
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ def test_join(self, multiindex_dataframe_random_data):
b = frame.loc[frame.index[2:], ["B", "C"]]

joined = a.join(b, how="outer").reindex(frame.index)
expected = frame.copy().values
expected = frame.copy().values.copy()
expected[np.isnan(joined.values)] = np.nan
expected = DataFrame(expected, index=frame.index, columns=frame.columns)

Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,7 +766,9 @@ def test_quantile_empty_no_columns(self, interp_method):
expected.columns.name = "captain tightpants"
tm.assert_frame_equal(result, expected)

def test_quantile_item_cache(self, using_array_manager, interp_method):
def test_quantile_item_cache(
self, using_array_manager, interp_method, using_copy_on_write
):
# previous behavior incorrect retained an invalid _item_cache entry
interpolation, method = interp_method
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
Expand All @@ -776,9 +778,15 @@ def test_quantile_item_cache(self, using_array_manager, interp_method):
assert len(df._mgr.blocks) == 2

df.quantile(numeric_only=False, interpolation=interpolation, method=method)
ser.values[0] = 99

assert df.iloc[0, 0] == df["A"][0]
if using_copy_on_write:
ser.iloc[0] = 99
assert df.iloc[0, 0] == df["A"][0]
assert df.iloc[0, 0] != 99
else:
ser.values[0] = 99
assert df.iloc[0, 0] == df["A"][0]
assert df.iloc[0, 0] == 99

def test_invalid_method(self):
with pytest.raises(ValueError, match="Invalid method: foo"):
Expand Down

0 comments on commit 68b409b

Please sign in to comment.