From fd866b77553d1a3fa09aa8af0c14a2f47e148459 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 19 Mar 2020 11:09:58 -0700 Subject: [PATCH] BUG: ExtensionBlock.set not setting values inplace --- pandas/core/internals/blocks.py | 52 ++++++++++++------------ pandas/tests/indexing/test_iloc.py | 11 +++++ pandas/tests/internals/test_internals.py | 17 ++++++++ 3 files changed, 55 insertions(+), 25 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index adeb1ae04a58d..ce9e4d2db85fa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,6 +11,7 @@ import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare +from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -349,11 +350,12 @@ def iget(self, i): def set(self, locs, values): """ - Modify Block in-place with new item value + Modify block values in-place with new item value. - Returns - ------- - None + Notes + ----- + `set` never creates a new array or new Block, whereas `setitem` _may_ + create a new array and always creates a new Block. """ self.values[locs] = values @@ -802,7 +804,7 @@ def _replace_single(self, *args, **kwargs): def setitem(self, indexer, value): """ - Set the value inplace, returning a a maybe different typed block. + Attempt self.values[indexer] = value, possibly creating a new array. Parameters ---------- @@ -1644,12 +1646,15 @@ def iget(self, col): raise IndexError(f"{self} only contains one item") return self.values - def should_store(self, value): + def should_store(self, value: ArrayLike) -> bool: + """ + Can we set the given array-like value inplace? + """ return isinstance(value, self._holder) - def set(self, locs, values, check=False): + def set(self, locs, values): assert locs.tolist() == [0] - self.values = values + self.values[:] = values def putmask( self, mask, new, align=True, inplace=False, axis=0, transpose=False, @@ -1760,7 +1765,7 @@ def is_numeric(self): def setitem(self, indexer, value): """ - Set the value inplace, returning a same-typed block. + Attempt self.values[indexer] = value, possibly creating a new array. This differs from Block.setitem by not allowing setitem to change the dtype of the Block. @@ -2070,7 +2075,7 @@ def to_native_types( ) return formatter.get_result_as_array() - def should_store(self, value) -> bool: + def should_store(self, value: ArrayLike) -> bool: # when inserting a column should not coerce integers to floats # unnecessarily return issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype @@ -2088,7 +2093,7 @@ def _can_hold_element(self, element: Any) -> bool: element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, np.bool_)) - def should_store(self, value) -> bool: + def should_store(self, value: ArrayLike) -> bool: return issubclass(value.dtype.type, np.complexfloating) @@ -2107,7 +2112,7 @@ def _can_hold_element(self, element: Any) -> bool: ) return is_integer(element) - def should_store(self, value) -> bool: + def should_store(self, value: ArrayLike) -> bool: return is_integer_dtype(value) and value.dtype == self.dtype @@ -2118,6 +2123,9 @@ class DatetimeLikeBlockMixin: def _holder(self): return DatetimeArray + def should_store(self, value): + return is_dtype_equal(self.dtype, value.dtype) + @property def fill_value(self): return np.datetime64("NaT", "ns") @@ -2254,16 +2262,9 @@ def to_native_types( ).reshape(i8values.shape) return np.atleast_2d(result) - def should_store(self, value) -> bool: - return is_datetime64_dtype(value.dtype) - def set(self, locs, values): """ - Modify Block in-place with new item value - - Returns - ------- - None + See Block.set.__doc__ """ values = conversion.ensure_datetime64ns(values, copy=False) @@ -2287,6 +2288,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types fill_value = np.datetime64("NaT", "ns") + should_store = DatetimeBlock.should_store @property def _holder(self): @@ -2496,9 +2498,6 @@ def fillna(self, value, **kwargs): ) return super().fillna(value, **kwargs) - def should_store(self, value) -> bool: - return is_timedelta64_dtype(value.dtype) - def to_native_types(self, slicer=None, na_rep=None, quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values @@ -2540,7 +2539,7 @@ def _can_hold_element(self, element: Any) -> bool: return issubclass(tipo.type, np.bool_) return isinstance(element, (bool, np.bool_)) - def should_store(self, value) -> bool: + def should_store(self, value: ArrayLike) -> bool: return issubclass(value.dtype.type, np.bool_) and not is_extension_array_dtype( value ) @@ -2632,7 +2631,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] def _can_hold_element(self, element: Any) -> bool: return True - def should_store(self, value) -> bool: + def should_store(self, value: ArrayLike) -> bool: return not ( issubclass( value.dtype.type, @@ -2881,6 +2880,9 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return Categorical + def should_store(self, arr: ArrayLike): + return isinstance(arr, self._holder) and is_dtype_equal(self.dtype, arr.dtype) + def to_native_types(self, slicer=None, na_rep="", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ values = self.values diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index f6b9e9a44ba14..9664f8d7212ad 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -694,6 +694,17 @@ def test_series_indexing_zerodim_np_array(self): result = s.iloc[np.array(0)] assert result == 1 + def test_iloc_setitem_categorical_updates_inplace(self): + # Mixed dtype ensures we go through take_split_path in setitem_with_indexer + cat = pd.Categorical(["A", "B", "C"]) + df = pd.DataFrame({1: cat, 2: [1, 2, 3]}) + + # This should modify our original values in-place + df.iloc[:, 0] = cat[::-1] + + expected = pd.Categorical(["C", "B", "A"]) + tm.assert_categorical_equal(cat, expected) + class TestILocSetItemDuplicateColumns: def test_iloc_setitem_scalar_duplicate_columns(self): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 1a7d5839d9a11..b97bca9bbdd4f 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1154,6 +1154,23 @@ def test_binop_other(self, op, value, dtype): tm.assert_series_equal(result, expected) +class TestShouldStore: + def test_should_store_categorical(self): + cat = pd.Categorical(["A", "B", "C"]) + df = pd.DataFrame(cat) + blk = df._data.blocks[0] + + # matching dtype + assert blk.should_store(cat) + assert blk.should_store(cat[:-1]) + + # different dtype + assert not blk.should_store(cat.as_ordered()) + + # ndarray instead of Categorical + assert not blk.should_store(np.asarray(cat)) + + @pytest.mark.parametrize( "typestr, holder", [