diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 288f84c5d35d4..2a718fdcf16e7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -531,6 +531,7 @@ Indexing - Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`) - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) - Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`) +- Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`) - Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`) - Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`) - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 565c153603b86..5c24c57925393 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3840,9 +3840,11 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None: arraylike = _reindex_for_setitem(value, self.index) self._set_item_mgr(key, arraylike) - def _iset_item_mgr(self, loc: int | slice | np.ndarray, value) -> None: + def _iset_item_mgr( + self, loc: int | slice | np.ndarray, value, inplace: bool = False + ) -> None: # when called from _set_item_mgr loc can be anything returned from get_loc - self._mgr.iset(loc, value) + self._mgr.iset(loc, value, inplace=inplace) self._clear_item_cache() def _set_item_mgr(self, key, value: ArrayLike) -> None: @@ -3860,9 +3862,9 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None: if len(self): self._check_setitem_copy() - def _iset_item(self, loc: int, value) -> None: + def _iset_item(self, loc: int, value, inplace: bool = False) -> None: arraylike = self._sanitize_column(value) - self._iset_item_mgr(loc, arraylike) + self._iset_item_mgr(loc, arraylike, inplace=inplace) # check if we are modifying a copy # try to set first as we want an invalid @@ -3994,13 +3996,13 @@ def _reset_cacher(self) -> None: # no-op for DataFrame pass - def _maybe_cache_changed(self, item, value: Series) -> None: + def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: """ The object has called back to us saying maybe it has changed. """ loc = self._info_axis.get_loc(item) arraylike = value._values - self._mgr.iset(loc, arraylike) + self._mgr.iset(loc, arraylike, inplace=inplace) # ---------------------------------------------------------------------- # Unsorted diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dee71a4f000e5..f3d7d6cee5446 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3531,7 +3531,10 @@ def _reset_cacher(self) -> None: raise AbstractMethodError(self) def _maybe_update_cacher( - self, clear: bool_t = False, verify_is_copy: bool_t = True + self, + clear: bool_t = False, + verify_is_copy: bool_t = True, + inplace: bool_t = False, ) -> None: """ See if we need to update our parent cacher if clear, then clear our diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 918a9ea1b8030..669274e034905 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1862,10 +1862,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): # set the item, possibly having a dtype change ser = ser.copy() ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) - ser._maybe_update_cacher(clear=True) + ser._maybe_update_cacher(clear=True, inplace=True) # reset the sliced object if unique - self.obj._iset_item(loc, ser) + self.obj._iset_item(loc, ser, inplace=True) def _setitem_single_block(self, indexer, value, name: str): """ @@ -1890,9 +1890,10 @@ def _setitem_single_block(self, indexer, value, name: str): if i != info_axis ) ): - selected_item_labels = item_labels[indexer[info_axis]] - if len(item_labels.get_indexer_for([selected_item_labels])) == 1: - self.obj[selected_item_labels] = value + col = item_labels[indexer[info_axis]] + if len(item_labels.get_indexer_for([col])) == 1: + loc = item_labels.get_loc(col) + self.obj._iset_item(loc, value, inplace=True) return indexer = maybe_convert_ix(*indexer) @@ -1910,7 +1911,7 @@ def _setitem_single_block(self, indexer, value, name: str): # actually do the set self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) - self.obj._maybe_update_cacher(clear=True) + self.obj._maybe_update_cacher(clear=True, inplace=True) def _setitem_with_indexer_missing(self, indexer, value): """ diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d1802afb7a2f1..7f728ac9ddae5 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -792,7 +792,9 @@ def column_arrays(self) -> list[ArrayLike]: """ return self.arrays - def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): + def iset( + self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False + ): """ Set new column(s). @@ -804,6 +806,8 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): loc : integer, slice or boolean mask Positional location (already bounds checked) value : np.ndarray or ExtensionArray + inplace : bool, default False + Whether overwrite existing array as opposed to replacing it. """ # single column -> single integer index if lib.is_integer(loc): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 745cddee93479..9286238e81fc3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1032,7 +1032,9 @@ def column_arrays(self) -> list[np.ndarray]: # expected "List[ndarray[Any, Any]]") return result # type: ignore[return-value] - def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): + def iset( + self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False + ): """ Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items @@ -1087,7 +1089,7 @@ def value_getitem(placement): for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] - if blk.should_store(value): + if inplace and blk.should_store(value): blk.set_inplace(blk_locs, value_getitem(val_locs)) else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0aaca406df9b4..b67f16008bb13 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1226,7 +1226,7 @@ def _check_is_chained_assignment_possible(self) -> bool: return super()._check_is_chained_assignment_possible() def _maybe_update_cacher( - self, clear: bool = False, verify_is_copy: bool = True + self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False ) -> None: """ See NDFrame._maybe_update_cacher.__doc__ @@ -1244,13 +1244,15 @@ def _maybe_update_cacher( # GH#42530 self.name must be in ref.columns # to ensure column still in dataframe # otherwise, either self or ref has swapped in new arrays - ref._maybe_cache_changed(cacher[0], self) + ref._maybe_cache_changed(cacher[0], self, inplace=inplace) else: # GH#33675 we have swapped in a new array, so parent # reference to self is now invalid ref._item_cache.pop(cacher[0], None) - super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy) + super()._maybe_update_cacher( + clear=clear, verify_is_copy=verify_is_copy, inplace=inplace + ) # ---------------------------------------------------------------------- # Unsorted diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 74460a75d7b63..942da38dc5a26 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -537,6 +537,7 @@ def test_getitem_setitem_integer_slice_keyerrors(self): @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): + sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 @@ -544,9 +545,11 @@ def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): # setting it triggers setting with copy sliced = float_frame.iloc[:, -3:] + assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values) + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - sliced["C"] = 4.0 + sliced.loc[:, "C"] = 4.0 assert (float_frame["C"] == 4).all() @@ -1002,9 +1005,11 @@ def test_iloc_row_slice_view(self, using_array_manager): # setting it makes it raise/warn subset = df.iloc[slice(4, 8)] + assert np.shares_memory(df[2], subset[2]) + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - subset[2] = 0.0 + subset.loc[:, 2] = 0.0 exp_col = original[2].copy() # TODO(ArrayManager) verify it is expected that the original didn't change @@ -1041,10 +1046,13 @@ def test_iloc_col_slice_view(self, using_array_manager): if not using_array_manager: # verify slice is view + + assert np.shares_memory(df[8]._values, subset[8]._values) + # and that we are setting a copy msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" with pytest.raises(com.SettingWithCopyError, match=msg): - subset[8] = 0.0 + subset.loc[:, 8] = 0.0 assert (df[8] == 0).all() else: diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 821aed535297b..d735f0dbec8a5 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1028,12 +1028,6 @@ def test_setitem_duplicate_columns_not_inplace(self): ) def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request): # GH#39510 - if not using_array_manager: - mark = pytest.mark.xfail( - reason="Setitem with same dtype still changing inplace" - ) - request.node.add_marker(mark) - cols = ["A", "B"] df = DataFrame(0, index=[0, 1], columns=cols) df_copy = df.copy() @@ -1056,3 +1050,34 @@ def test_setitem_listlike_key_scalar_value_not_inplace(self, value): expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols) tm.assert_frame_equal(df_view, df_copy) tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "indexer", + [ + "a", + ["a"], + pytest.param( + [True, False], + marks=pytest.mark.xfail( + reason="Boolean indexer incorrectly setting inplace", + strict=False, # passing on some builds, no obvious pattern + ), + ), + ], + ) + @pytest.mark.parametrize( + "value, set_value", + [ + (1, 5), + (1.0, 5.0), + (Timestamp("2020-12-31"), Timestamp("2021-12-31")), + ("a", "b"), + ], + ) + def test_setitem_not_operating_inplace(self, value, set_value, indexer): + # GH#43406 + df = DataFrame({"a": value}, index=[0, 1]) + expected = df.copy() + view = df[:] + df[indexer] = set_value + tm.assert_frame_equal(view, expected) diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index 462d588aff58f..26ecf1356a946 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -173,7 +173,10 @@ def test_rename_multiindex(self): @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view def test_rename_nocopy(self, float_frame): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) - renamed["foo"] = 1.0 + + assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) + + renamed.loc[:, "foo"] = 1.0 assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 6484ac1f8a8e2..046d349b92f3f 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -824,20 +824,24 @@ def test_iloc_empty_list_indexer_is_ok(self): df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager): + def test_identity_slice_returns_new_object(self, using_array_manager, request): # GH13873 + if using_array_manager: + mark = pytest.mark.xfail( + reason="setting with .loc[:, 'a'] does not alter inplace" + ) + request.node.add_marker(mark) + original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] assert sliced_df is not original_df # should be a shallow copy - original_df["a"] = [4, 4, 4] - if using_array_manager: - # TODO(ArrayManager) verify it is expected that the original didn't change - # setitem is replacing full column, so doesn't update "viewing" dataframe - assert not (sliced_df["a"] == 4).all() - else: - assert (sliced_df["a"] == 4).all() + assert np.shares_memory(original_df["a"], sliced_df["a"]) + + # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig + original_df.loc[:, "a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index cf2a4a75f95b5..bc08c53784e76 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1002,21 +1002,25 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager): + def test_identity_slice_returns_new_object(self, using_array_manager, request): # GH13873 + if using_array_manager: + mark = pytest.mark.xfail( + reason="setting with .loc[:, 'a'] does not alter inplace" + ) + request.node.add_marker(mark) + original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.loc[:] assert sliced_df is not original_df assert original_df[:] is not original_df # should be a shallow copy - original_df["a"] = [4, 4, 4] - if using_array_manager: - # TODO(ArrayManager) verify it is expected that the original didn't change - # setitem is replacing full column, so doesn't update "viewing" dataframe - assert not (sliced_df["a"] == 4).all() - else: - assert (sliced_df["a"] == 4).all() + assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values) + + # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig + original_df.loc[:, "a"] = [4, 4, 4] + assert (sliced_df["a"] == 4).all() # These should not return copies assert original_df is original_df.loc[:, :] diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 00635671e459b..54706dc24fc42 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -750,7 +750,11 @@ def test_get_numeric_data(self): ) # Check sharing - numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0])) + numeric.iset( + numeric.items.get_loc("float"), + np.array([100.0, 200.0, 300.0]), + inplace=True, + ) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("float")).internal_values(), np.array([100.0, 200.0, 300.0]), @@ -759,7 +763,9 @@ def test_get_numeric_data(self): numeric2 = mgr.get_numeric_data(copy=True) tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) numeric2.iset( - numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0]) + numeric2.items.get_loc("float"), + np.array([1000.0, 2000.0, 3000.0]), + inplace=True, ) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("float")).internal_values(), @@ -781,7 +787,7 @@ def test_get_bool_data(self): bools.iget(bools.items.get_loc("bool")).internal_values(), ) - bools.iset(0, np.array([True, False, True])) + bools.iset(0, np.array([True, False, True]), inplace=True) tm.assert_numpy_array_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), np.array([True, False, True]), diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e42dcfbe38931..371a7fed543e4 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -308,21 +308,8 @@ def test_merge_nocopy(self, using_array_manager): merged = merge(left, right, left_index=True, right_index=True, copy=False) - if using_array_manager: - # With ArrayManager, setting a column doesn't change the values inplace - # and thus does not propagate the changes to the original left/right - # dataframes -> need to check that no copy was made in a different way - # TODO(ArrayManager) we should be able to simplify this with a .loc - # setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10 - # but this currently replaces the array (_setitem_with_indexer_split_path) - assert merged._mgr.arrays[0] is left._mgr.arrays[0] - assert merged._mgr.arrays[2] is right._mgr.arrays[0] - else: - merged["a"] = 6 - assert (left["a"] == 6).all() - - merged["d"] = "peekaboo" - assert (right["d"] == "peekaboo").all() + assert np.shares_memory(merged["a"]._values, left["a"]._values) + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame