Skip to content

Commit

Permalink
Backport PR #51411 on branch 2.0.x (ENH: Optimize CoW for fillna with…
Browse files Browse the repository at this point in the history
… ea dtypes) (#51639)

Backport PR #51411: ENH: Optimize CoW for fillna with ea dtypes

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Feb 26, 2023
1 parent 7cc1791 commit 608c2d8
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 45 deletions.
11 changes: 8 additions & 3 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1758,9 +1758,14 @@ def fillna(
downcast=downcast,
using_cow=using_cow,
)
new_values = self.values.fillna(value=value, method=None, limit=limit)
nb = self.make_block_same_class(new_values)
return nb._maybe_downcast([nb], downcast)
if using_cow and self._can_hold_na and not self.values._hasna:
refs = self.refs
new_values = self.values
else:
refs = None
new_values = self.values.fillna(value=value, method=None, limit=limit)
nb = self.make_block_same_class(new_values, refs=refs)
return nb._maybe_downcast([nb], downcast, using_cow=using_cow)

@cache_readonly
def shape(self) -> Shape:
Expand Down
46 changes: 46 additions & 0 deletions pandas/tests/copy_view/test_interp_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pytest

from pandas import (
NA,
DataFrame,
Interval,
NaT,
Expand Down Expand Up @@ -271,3 +272,48 @@ def test_fillna_series_empty_arg_inplace(using_copy_on_write):
assert np.shares_memory(get_array(ser), arr)
if using_copy_on_write:
assert ser._mgr._has_no_reference(0)


def test_fillna_ea_noop_shares_memory(
using_copy_on_write, any_numeric_ea_and_arrow_dtype
):
df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
df_orig = df.copy()
df2 = df.fillna(100)

assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert not df2._mgr._has_no_reference(1)
else:
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))

tm.assert_frame_equal(df_orig, df)

df2.iloc[0, 1] = 100
if using_copy_on_write:
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert df2._mgr._has_no_reference(1)
assert df._mgr._has_no_reference(1)
tm.assert_frame_equal(df_orig, df)


def test_fillna_inplace_ea_noop_shares_memory(
using_copy_on_write, any_numeric_ea_and_arrow_dtype
):
df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
df_orig = df.copy()
view = df[:]
df.fillna(100, inplace=True)

assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
assert not df._mgr._has_no_reference(1)
assert not view._mgr._has_no_reference(1)
else:
assert not np.shares_memory(get_array(df, "b"), get_array(view, "b"))
df.iloc[0, 1] = 100
tm.assert_frame_equal(df_orig, view)
15 changes: 8 additions & 7 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,24 +243,25 @@ def test_factorize_empty(self, data):
def test_fillna_copy_frame(self, data_missing):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr})
df_orig = df.copy()

filled_val = df.iloc[0, 0]
result = df.fillna(filled_val)

assert df.A.values is not result.A.values
result.iloc[0, 0] = filled_val

def test_fillna_copy_series(self, data_missing, no_op_with_cow: bool = False):
self.assert_frame_equal(df, df_orig)

def test_fillna_copy_series(self, data_missing):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)
ser_orig = ser.copy()

filled_val = ser[0]
result = ser.fillna(filled_val)
result.iloc[0] = filled_val

if no_op_with_cow:
assert ser._values is result._values
else:
assert ser._values is not result._values
assert ser._values is arr
self.assert_series_equal(ser, ser_orig)

def test_fillna_length_mismatch(self, data_missing):
msg = "Length of 'value' does not match."
Expand Down
13 changes: 12 additions & 1 deletion pandas/tests/extension/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@

import pytest

from pandas import Series
from pandas import (
Series,
options,
)


@pytest.fixture
Expand Down Expand Up @@ -193,3 +196,11 @@ def invalid_scalar(data):
If the array can hold any item (i.e. object dtype), then use pytest.skip.
"""
return object.__new__(object)


@pytest.fixture
def using_copy_on_write() -> bool:
"""
Fixture to check if Copy-on-Write is enabled.
"""
return options.mode.copy_on_write and options.mode.data_manager == "block"
14 changes: 3 additions & 11 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,17 +301,9 @@ def test_searchsorted(self, data_for_sorting):
def test_equals(self, data, na_value, as_series):
super().test_equals(data, na_value, as_series)

def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr})

filled_val = df.iloc[0, 0]
result = df.fillna(filled_val)

if using_copy_on_write:
assert df.A.values is result.A.values
else:
assert df.A.values is not result.A.values
@pytest.mark.skip("fill-value is interpreted as a dict of values")
def test_fillna_copy_frame(self, data_missing):
super().test_fillna_copy_frame(data_missing)


class TestCasting(BaseJSON, base.BaseCastingTests):
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/extension/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,6 @@ def test_combine_add(self, data_repeated):
# Timestamp.__add__(Timestamp) not defined
pass

def test_fillna_copy_series(self, data_missing, using_copy_on_write):
super().test_fillna_copy_series(
data_missing, no_op_with_cow=using_copy_on_write
)


class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
pass
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/extension/test_interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,6 @@ def test_combine_add(self, data_repeated):
def test_fillna_length_mismatch(self, data_missing):
super().test_fillna_length_mismatch(data_missing)

def test_fillna_copy_series(self, data_missing, using_copy_on_write):
super().test_fillna_copy_series(
data_missing, no_op_with_cow=using_copy_on_write
)


class TestMissing(BaseInterval, base.BaseMissingTests):
# Index.fillna only accepts scalar `value`, so we have to xfail all
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/extension/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ def test_diff(self, data, periods):
else:
super().test_diff(data, periods)

def test_fillna_copy_series(self, data_missing, using_copy_on_write):
super().test_fillna_copy_series(
data_missing, no_op_with_cow=using_copy_on_write
)


class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
pass
Expand Down
15 changes: 11 additions & 4 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,25 +272,32 @@ def test_fillna_frame(self, data_missing):
class TestMethods(BaseSparseTests, base.BaseMethodsTests):
_combine_le_expected_dtype = "Sparse[bool]"

def test_fillna_copy_frame(self, data_missing):
def test_fillna_copy_frame(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
df = pd.DataFrame({"A": arr}, copy=False)

filled_val = df.iloc[0, 0]
result = df.fillna(filled_val)

if hasattr(df._mgr, "blocks"):
assert df.values.base is not result.values.base
if using_copy_on_write:
assert df.values.base is result.values.base
else:
assert df.values.base is not result.values.base
assert df.A._values.to_dense() is arr.to_dense()

def test_fillna_copy_series(self, data_missing):
def test_fillna_copy_series(self, data_missing, using_copy_on_write):
arr = data_missing.take([1, 1])
ser = pd.Series(arr)

filled_val = ser[0]
result = ser.fillna(filled_val)

assert ser._values is not result._values
if using_copy_on_write:
assert ser._values is result._values

else:
assert ser._values is not result._values
assert ser._values.to_dense() is arr.to_dense()

@pytest.mark.xfail(reason="Not Applicable")
Expand Down
14 changes: 10 additions & 4 deletions pandas/tests/groupby/test_raises.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,9 @@ def test_groupby_raises_datetime_np(how, by, groupby_series, groupby_func_np):


@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category(how, by, groupby_series, groupby_func):
def test_groupby_raises_category(
how, by, groupby_series, groupby_func, using_copy_on_write
):
# GH#50749
df = DataFrame(
{
Expand Down Expand Up @@ -370,7 +372,9 @@ def test_groupby_raises_category(how, by, groupby_series, groupby_func):
TypeError,
r"Cannot setitem on a Categorical with a new category \(0\), "
+ "set the categories first",
),
)
if not using_copy_on_write
else (None, ""), # no-op with CoW
"first": (None, ""),
"idxmax": (None, ""),
"idxmin": (None, ""),
Expand Down Expand Up @@ -491,7 +495,7 @@ def test_groupby_raises_category_np(how, by, groupby_series, groupby_func_np):

@pytest.mark.parametrize("how", ["method", "agg", "transform"])
def test_groupby_raises_category_on_category(
how, by, groupby_series, groupby_func, observed
how, by, groupby_series, groupby_func, observed, using_copy_on_write
):
# GH#50749
df = DataFrame(
Expand Down Expand Up @@ -562,7 +566,9 @@ def test_groupby_raises_category_on_category(
TypeError,
r"Cannot setitem on a Categorical with a new category \(0\), "
+ "set the categories first",
),
)
if not using_copy_on_write
else (None, ""), # no-op with CoW
"first": (None, ""),
"idxmax": (ValueError, "attempt to get argmax of an empty sequence")
if empty_groups
Expand Down

0 comments on commit 608c2d8

Please sign in to comment.