Backport PR #51082 on branch 2.0.x (API / CoW: return read-only numpy…

… arrays in .values/to_numpy()) (#51933) Backport PR #51082: API / CoW: return read-only numpy arrays in .values/to_numpy() Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
pandas-dev · Mar 13, 2023 · 68b409b · 68b409b
1 parent 0bbc9f5
commit 68b409b
Show file tree

Hide file tree

Showing 26 changed files with 274 additions and 53 deletions.
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -20,6 +20,8 @@
 
 import numpy as np
 
+from pandas._config import using_copy_on_write
+
 from pandas._libs import lib
 from pandas._typing import (
     Axis,
@@ -545,10 +547,16 @@ def to_numpy(
 
         result = np.asarray(values, dtype=dtype)
 
-        if copy and na_value is lib.no_default:
+        if (copy and na_value is lib.no_default) or (
+            not copy and using_copy_on_write()
+        ):
             if np.shares_memory(self._values[:2], result[:2]):
                 # Take slices to improve performance of check
-                result = result.copy()
+                if using_copy_on_write() and not copy:
+                    result = result.view()
+                    result.flags.writeable = False
+                else:
+                    result = result.copy()
 
         return result
 

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1989,7 +1989,13 @@ def empty(self) -> bool_t:
     __array_priority__: int = 1000
 
     def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
-        return np.asarray(self._values, dtype=dtype)
+        values = self._values
+        arr = np.asarray(values, dtype=dtype)
+        if arr is values and using_copy_on_write():
+            # TODO(CoW) also properly handle extension dtypes
+            arr = arr.view()
+            arr.flags.writeable = False
+        return arr
 
     @final
     def __array_ufunc__(

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -14,6 +14,8 @@
 
 import numpy as np
 
+from pandas._config import using_copy_on_write
+
 from pandas._libs import (
     internals as libinternals,
     lib,
@@ -2592,6 +2594,12 @@ def external_values(values: ArrayLike) -> ArrayLike:
         # NB: for datetime64tz this is different from np.asarray(values), since
         #  that returns an object-dtype ndarray of Timestamps.
         # Avoid raising in .astype in casting from dt64tz to dt64
-        return values._ndarray
-    else:
-        return values
+        values = values._ndarray
+
+    if isinstance(values, np.ndarray) and using_copy_on_write():
+        values = values.view()
+        values.flags.writeable = False
+
+    # TODO(CoW) we should also mark our ExtensionArrays as read-only
+
+    return values
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1719,13 +1719,16 @@ def as_array(
                 arr = np.asarray(blk.get_values())
                 if dtype:
                     arr = arr.astype(dtype, copy=False)
+
+            if copy:
+                arr = arr.copy()
+            elif using_copy_on_write():
+                arr = arr.view()
+                arr.flags.writeable = False
         else:
             arr = self._interleave(dtype=dtype, na_value=na_value)
-            # The underlying data was copied within _interleave
-            copy = False
-
-        if copy:
-            arr = arr.copy()
+            # The underlying data was copied within _interleave, so no need
+            # to further copy if copy=True or setting na_value
 
         if na_value is not lib.no_default:
             arr[isna(arr)] = na_value

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -897,7 +897,13 @@ def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
         array(['1999-12-31T23:00:00.000000000', ...],
               dtype='datetime64[ns]')
         """
-        return np.asarray(self._values, dtype)
+        values = self._values
+        arr = np.asarray(values, dtype=dtype)
+        if arr is values and using_copy_on_write():
+            # TODO(CoW) also properly handle extension dtypes
+            arr = arr.view()
+            arr.flags.writeable = False
+        return arr
 
     # ----------------------------------------------------------------------
     # Unary Methods

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -1125,7 +1125,7 @@ def converter(*date_cols, col: Hashable):
                 dayfirst=dayfirst,
                 errors="ignore",
                 cache=cache_dates,
-            ).to_numpy()
+            )._values
         else:
             try:
                 result = tools.to_datetime(

diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
@@ -0,0 +1,112 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+# -----------------------------------------------------------------------------
+# Copy/view behaviour for accessing underlying array of Series/DataFrame
+
+
+@pytest.mark.parametrize(
+    "method",
+    [lambda ser: ser.values, lambda ser: np.asarray(ser)],
+    ids=["values", "asarray"],
+)
+def test_series_values(using_copy_on_write, method):
+    ser = Series([1, 2, 3], name="name")
+    ser_orig = ser.copy()
+
+    arr = method(ser)
+
+    if using_copy_on_write:
+        # .values still gives a view but is read-only
+        assert np.shares_memory(arr, get_array(ser, "name"))
+        assert arr.flags.writeable is False
+
+        # mutating series through arr therefore doesn't work
+        with pytest.raises(ValueError, match="read-only"):
+            arr[0] = 0
+        tm.assert_series_equal(ser, ser_orig)
+
+        # mutating the series itself still works
+        ser.iloc[0] = 0
+        assert ser.values[0] == 0
+    else:
+        assert arr.flags.writeable is True
+        arr[0] = 0
+        assert ser.iloc[0] == 0
+
+
+@pytest.mark.parametrize(
+    "method",
+    [lambda df: df.values, lambda df: np.asarray(df)],
+    ids=["values", "asarray"],
+)
+def test_dataframe_values(using_copy_on_write, using_array_manager, method):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df_orig = df.copy()
+
+    arr = method(df)
+
+    if using_copy_on_write:
+        # .values still gives a view but is read-only
+        assert np.shares_memory(arr, get_array(df, "a"))
+        assert arr.flags.writeable is False
+
+        # mutating the dataframe through arr therefore doesn't work
+        with pytest.raises(ValueError, match="read-only"):
+            arr[0, 0] = 0
+        tm.assert_frame_equal(df, df_orig)
+
+        # mutating the dataframe itself still works
+        df.iloc[0, 0] = 0
+        assert df.values[0, 0] == 0
+    else:
+        assert arr.flags.writeable is True
+        arr[0, 0] = 0
+        if not using_array_manager:
+            assert df.iloc[0, 0] == 0
+        else:
+            tm.assert_frame_equal(df, df_orig)
+
+
+def test_series_to_numpy(using_copy_on_write):
+    ser = Series([1, 2, 3], name="name")
+    ser_orig = ser.copy()
+
+    # default: copy=False, no dtype or NAs
+    arr = ser.to_numpy()
+    if using_copy_on_write:
+        # to_numpy still gives a view but is read-only
+        assert np.shares_memory(arr, get_array(ser, "name"))
+        assert arr.flags.writeable is False
+
+        # mutating series through arr therefore doesn't work
+        with pytest.raises(ValueError, match="read-only"):
+            arr[0] = 0
+        tm.assert_series_equal(ser, ser_orig)
+
+        # mutating the series itself still works
+        ser.iloc[0] = 0
+        assert ser.values[0] == 0
+    else:
+        assert arr.flags.writeable is True
+        arr[0] = 0
+        assert ser.iloc[0] == 0
+
+    # specifying copy=True gives a writeable array (a copy, not a view)
+    ser = Series([1, 2, 3], name="name")
+    arr = ser.to_numpy(copy=True)
+    assert not np.shares_memory(arr, get_array(ser, "name"))
+    assert arr.flags.writeable is True
+
+    # specifying a dtype that already causes a copy also gives a writeable array
+    ser = Series([1, 2, 3], name="name")
+    arr = ser.to_numpy(dtype="float64")
+    assert not np.shares_memory(arr, get_array(ser, "name"))
+    assert arr.flags.writeable is True
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -345,7 +345,7 @@ def test_setitem2(self):
 
     def test_setitem_boolean(self, float_frame):
         df = float_frame.copy()
-        values = float_frame.values
+        values = float_frame.values.copy()
 
         df[df["A"] > 0] = 4
         values[values[:, 0] > 0] = 4
@@ -381,16 +381,18 @@ def test_setitem_boolean(self, float_frame):
             df[df * 0] = 2
 
         # index with DataFrame
+        df_orig = df.copy()
         mask = df > np.abs(df)
-        expected = df.copy()
         df[df > np.abs(df)] = np.nan
-        expected.values[mask.values] = np.nan
+        values = df_orig.values.copy()
+        values[mask.values] = np.nan
+        expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
         tm.assert_frame_equal(df, expected)
 
         # set from DataFrame
-        expected = df.copy()
         df[df > np.abs(df)] = df * 2
-        np.putmask(expected.values, mask.values, df.values * 2)
+        np.putmask(values, mask.values, df.values * 2)
+        expected = DataFrame(values, index=df_orig.index, columns=df_orig.columns)
         tm.assert_frame_equal(df, expected)
 
     def test_setitem_cast(self, float_frame):
@@ -664,16 +666,20 @@ def test_setitem_fancy_boolean(self, float_frame):
         # from 2d, set with booleans
         frame = float_frame.copy()
         expected = float_frame.copy()
+        values = expected.values.copy()
 
         mask = frame["A"] > 0
         frame.loc[mask] = 0.0
-        expected.values[mask.values] = 0.0
+        values[mask.values] = 0.0
+        expected = DataFrame(values, index=expected.index, columns=expected.columns)
         tm.assert_frame_equal(frame, expected)
 
         frame = float_frame.copy()
         expected = float_frame.copy()
+        values = expected.values.copy()
         frame.loc[mask, ["A", "B"]] = 0.0
-        expected.values[mask.values, :2] = 0.0
+        values[mask.values, :2] = 0.0
+        expected = DataFrame(values, index=expected.index, columns=expected.columns)
         tm.assert_frame_equal(frame, expected)
 
     def test_getitem_fancy_ints(self, float_frame):

diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py
@@ -71,7 +71,7 @@ def test_insert_with_columns_dups(self):
         )
         tm.assert_frame_equal(df, exp)
 
-    def test_insert_item_cache(self, using_array_manager):
+    def test_insert_item_cache(self, using_array_manager, using_copy_on_write):
         df = DataFrame(np.random.randn(4, 3))
         ser = df[0]
 
@@ -85,9 +85,14 @@ def test_insert_item_cache(self, using_array_manager):
             for n in range(100):
                 df[n + 3] = df[1] * n
 
-        ser.values[0] = 99
-
-        assert df.iloc[0, 0] == df[0][0]
+        if using_copy_on_write:
+            ser.iloc[0] = 99
+            assert df.iloc[0, 0] == df[0][0]
+            assert df.iloc[0, 0] != 99
+        else:
+            ser.values[0] = 99
+            assert df.iloc[0, 0] == df[0][0]
+            assert df.iloc[0, 0] == 99
 
     def test_insert_EA_no_warning(self):
         # PerformanceWarning about fragmented frame should not be raised when

diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -1002,8 +1002,9 @@ def test_setitem_boolean_mask(self, mask_type, float_frame):
         result = df.copy()
         result[mask] = np.nan
 
-        expected = df.copy()
-        expected.values[np.array(mask)] = np.nan
+        expected = df.values.copy()
+        expected[np.array(mask)] = np.nan
+        expected = DataFrame(expected, index=df.index, columns=df.columns)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.xfail(reason="Currently empty indexers are treated as all False")

diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
@@ -982,7 +982,7 @@ def test_where_dt64_2d():
 
     df = DataFrame(dta, columns=["A", "B"])
 
-    mask = np.asarray(df.isna())
+    mask = np.asarray(df.isna()).copy()
     mask[:, 1] = True
 
     # setting all of one column, none of the other

diff --git a/pandas/tests/frame/methods/test_copy.py b/pandas/tests/frame/methods/test_copy.py
@@ -18,6 +18,7 @@ def test_copy_index_name_checking(self, float_frame, attr):
         getattr(cp, attr).name = "foo"
         assert getattr(float_frame, attr).name is None
 
+    @td.skip_copy_on_write_invalid_test
     def test_copy_cache(self):
         # GH#31784 _item_cache not cleared on copy causes incorrect reads after updates
         df = DataFrame({"a": [1]})

diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py
@@ -417,7 +417,7 @@ def test_join(self, multiindex_dataframe_random_data):
         b = frame.loc[frame.index[2:], ["B", "C"]]
 
         joined = a.join(b, how="outer").reindex(frame.index)
-        expected = frame.copy().values
+        expected = frame.copy().values.copy()
         expected[np.isnan(joined.values)] = np.nan
         expected = DataFrame(expected, index=frame.index, columns=frame.columns)
 

diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
@@ -766,7 +766,9 @@ def test_quantile_empty_no_columns(self, interp_method):
         expected.columns.name = "captain tightpants"
         tm.assert_frame_equal(result, expected)
 
-    def test_quantile_item_cache(self, using_array_manager, interp_method):
+    def test_quantile_item_cache(
+        self, using_array_manager, interp_method, using_copy_on_write
+    ):
         # previous behavior incorrect retained an invalid _item_cache entry
         interpolation, method = interp_method
         df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
@@ -776,9 +778,15 @@ def test_quantile_item_cache(self, using_array_manager, interp_method):
             assert len(df._mgr.blocks) == 2
 
         df.quantile(numeric_only=False, interpolation=interpolation, method=method)
-        ser.values[0] = 99
 
-        assert df.iloc[0, 0] == df["A"][0]
+        if using_copy_on_write:
+            ser.iloc[0] = 99
+            assert df.iloc[0, 0] == df["A"][0]
+            assert df.iloc[0, 0] != 99
+        else:
+            ser.values[0] = 99
+            assert df.iloc[0, 0] == df["A"][0]
+            assert df.iloc[0, 0] == 99
 
     def test_invalid_method(self):
         with pytest.raises(ValueError, match="Invalid method: foo"):