diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e03d10b045824..b95c515831f55 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1330,24 +1330,34 @@ indexing operation, the result will be a copy. With single label / scalar indexing and slicing, e.g. ``df.ix[3:6]`` or ``df.ix[:, 'A']``, a view will be returned. -In chained expressions, the order may determine whether a copy is returned or not: +In chained expressions, the order may determine whether a copy is returned or not. +If an expression will set values on a copy of a slice, then a ``SettingWithCopy`` +exception will be raised (this raise/warn behavior is new starting in 0.13.0) -.. ipython:: python +You can control the action of a chained assignment via the option ``mode.chained_assignment``, +which can take the values ``['raise','warn',None]``, where showing a warning is the default. +.. ipython:: python dfb = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], - 'b' : ['x', 'y', 'y', - 'x', 'y', 'x', 'x'], - 'c' : randn(7)}) - - - # goes to copy (will be lost) - dfb[dfb.a.str.startswith('o')]['c'] = 42 + 'c' : np.arange(7)}) # passed via reference (will stay) dfb['c'][dfb.a.str.startswith('o')] = 42 +This however is operating on a copy and will not work. + +:: + + >>> pd.set_option('mode.chained_assignment','warn') + >>> dfb[dfb.a.str.startswith('o')]['c'] = 42 + Traceback (most recent call last) + ... + SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + Try using .loc[row_index,col_indexer] = value instead + A chained assignment can also crop up in setting in a mixed dtype frame. .. note:: @@ -1359,28 +1369,35 @@ This is the correct access method .. ipython:: python dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - dfc_copy = dfc.copy() - dfc_copy.loc[0,'A'] = 11 - dfc_copy + dfc.loc[0,'A'] = 11 + dfc This *can* work at times, but is not guaranteed, and so should be avoided .. ipython:: python - dfc_copy = dfc.copy() - dfc_copy['A'][0] = 111 - dfc_copy + dfc = dfc.copy() + dfc['A'][0] = 111 + dfc This will **not** work at all, and so should be avoided -.. ipython:: python +:: + + >>> pd.set_option('mode.chained_assignment','raise') + >>> dfc.loc[0]['A'] = 1111 + Traceback (most recent call last) + ... + SettingWithCopyException: + A value is trying to be set on a copy of a slice from a DataFrame. + Try using .loc[row_index,col_indexer] = value instead + +.. warning:: - dfc_copy = dfc.copy() - dfc_copy.loc[0]['A'] = 1111 - dfc_copy + The chained assignment warnings / exceptions are aiming to inform the user of a possibly invalid + assignment. There may be false positives; situations where a chained assignment is inadvertantly + reported. -When assigning values to subsets of your data, thus, make sure to either use the -pandas access methods or explicitly handle the assignment creating a copy. Fallback indexing ----------------- diff --git a/doc/source/release.rst b/doc/source/release.rst index 926e8f1d0c5ea..2e9654b2131f1 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -396,6 +396,9 @@ API Changes 3 4.000000 dtype: float64 + - raise/warn ``SettingWithCopyError/Warning`` exception/warning when setting of a + copy thru chained assignment is detected, settable via option ``mode.chained_assignment`` + Internal Refactoring ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index c736a52cd1e71..b3f831af35339 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -104,6 +104,34 @@ API changes - ``Series`` and ``DataFrame`` now have a ``mode()`` method to calculate the statistical mode(s) by axis/Series. (:issue:`5367`) +- Chained assignment will now by default warn if the user is assigning to a copy. This can be changed + with he option ``mode.chained_assignment``, allowed options are ``raise/warn/None``. See :ref:`the docs`. + + .. ipython:: python + + dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + pd.set_option('chained_assignment','warn') + + The following warning / exception will show if this is attempted. + + .. ipython:: python + + dfc.loc[0]['A'] = 1111 + + :: + + Traceback (most recent call last) + ... + SettingWithCopyWarning: + A value is trying to be set on a copy of a slice from a DataFrame. + Try using .loc[row_index,col_indexer] = value instead + + Here is the correct method of assignment. + + .. ipython:: python + + dfc.loc[0,'A'] = 11 + dfc Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/common.py b/pandas/core/common.py index e89ae44dacd31..453227aec6e23 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -26,6 +26,11 @@ class PandasError(Exception): pass +class SettingWithCopyError(ValueError): + pass + +class SettingWithCopyWarning(Warning): + pass class AmbiguousIndexError(PandasError, KeyError): pass diff --git a/pandas/core/config.py b/pandas/core/config.py index 9de596142e7e0..20ec30398fd64 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -512,6 +512,13 @@ def _get_root(key): cursor = cursor[p] return cursor, path[-1] +def _get_option_fast(key): + """ internal quick access routine, no error checking """ + path = key.split('.') + cursor = _global_config + for p in path: + cursor = cursor[p] + return cursor def _is_deprecated(key): """ Returns True if the given option has been deprecated """ diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 13f7a3dbe7d4a..1275a5463cbe3 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -271,7 +271,6 @@ def mpl_style_cb(key): # We don't want to start importing everything at the global context level # or we'll hit circular deps. - def use_inf_as_null_cb(key): from pandas.core.common import _use_inf_as_null _use_inf_as_null(key) @@ -281,6 +280,17 @@ def use_inf_as_null_cb(key): cb=use_inf_as_null_cb) +# user warnings +chained_assignment = """ +: string + Raise an exception, warn, or no action if trying to use chained assignment, The default is warn +""" + +with cf.config_prefix('mode'): + cf.register_option('chained_assignment', 'warn', chained_assignment, + validator=is_one_of_factory([None, 'warn', 'raise'])) + + # Set up the io.excel specific configuration. writer_engine_doc = """ : string diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e843e40037b1..280203fa65232 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1547,12 +1547,9 @@ def _ixs(self, i, axis=0, copy=False): i = _maybe_convert_indices(i, len(self._get_axis(axis))) return self.reindex(i, takeable=True) else: - try: - new_values = self._data.fast_2d_xs(i, copy=copy) - except: - new_values = self._data.fast_2d_xs(i, copy=True) + new_values, copy = self._data.fast_2d_xs(i, copy=copy) return Series(new_values, index=self.columns, - name=self.index[i]) + name=self.index[i])._setitem_copy(copy) # icol else: @@ -1892,10 +1889,18 @@ def _set_item(self, key, value): Series/TimeSeries will be conformed to the DataFrame's index to ensure homogeneity. """ + + is_existing = key in self.columns self._ensure_valid_index(value) value = self._sanitize_column(key, value) NDFrame._set_item(self, key, value) + # check if we are modifying a copy + # try to set first as we want an invalid + # value exeption to occur first + if is_existing: + self._check_setitem_copy() + def insert(self, loc, column, value, allow_duplicates=False): """ Insert column into DataFrame at specified location. @@ -2093,13 +2098,16 @@ def xs(self, key, axis=0, level=None, copy=True, drop_level=True): new_index = self.index[loc] if np.isscalar(loc): - new_values = self._data.fast_2d_xs(loc, copy=copy) - return Series(new_values, index=self.columns, - name=self.index[loc]) + + new_values, copy = self._data.fast_2d_xs(loc, copy=copy) + result = Series(new_values, index=self.columns, + name=self.index[loc])._setitem_copy(copy) + else: result = self[loc] result.index = new_index - return result + + return result _xs = xs diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b5e526e42a547..30dccb971ae18 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -19,9 +19,11 @@ from pandas import compat, _np_version_under1p7 from pandas.compat import map, zip, lrange, string_types, isidentifier from pandas.core.common import (isnull, notnull, is_list_like, - _values_from_object, _maybe_promote, ABCSeries) + _values_from_object, _maybe_promote, ABCSeries, + SettingWithCopyError, SettingWithCopyWarning) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution +from pandas.core import config # goal is to be able to define the docs close to function, while still being # able to share @@ -69,7 +71,7 @@ class NDFrame(PandasObject): copy : boolean, default False """ _internal_names = [ - '_data', 'name', '_cacher', '_subtyp', '_index', '_default_kind', '_default_fill_value'] + '_data', 'name', '_cacher', '_is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value'] _internal_names_set = set(_internal_names) _metadata = [] @@ -85,6 +87,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None, fastpath=False): for i, ax in enumerate(axes): data = data.reindex_axis(ax, axis=i) + object.__setattr__(self, '_is_copy', False) object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) @@ -988,6 +991,22 @@ def _set_item(self, key, value): self._data.set(key, value) self._clear_item_cache() + def _setitem_copy(self, copy): + """ set the _is_copy of the iiem """ + self._is_copy = copy + return self + + def _check_setitem_copy(self): + """ validate if we are doing a settitem on a chained copy """ + if self._is_copy: + value = config._get_option_fast('mode.chained_assignment') + + t = "A value is trying to be set on a copy of a slice from a DataFrame.\nTry using .loc[row_index,col_indexer] = value instead" + if value == 'raise': + raise SettingWithCopyError(t) + elif value == 'warn': + warnings.warn(t,SettingWithCopyWarning) + def __delitem__(self, key): """ Delete item @@ -1049,7 +1068,7 @@ def take(self, indices, axis=0, convert=True): new_data = self._data.reindex_axis(new_items, indexer=indices, axis=0) else: new_data = self._data.take(indices, axis=baxis) - return self._constructor(new_data).__finalize__(self) + return self._constructor(new_data)._setitem_copy(True).__finalize__(self) # TODO: Check if this was clearer in 0.12 def select(self, crit, axis=0): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 62aa95d270924..ae22d3d406c7e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2567,22 +2567,20 @@ def fast_2d_xs(self, loc, copy=False): """ get a cross sectional for a given location in the items ; handle dups + + return the result and a flag if a copy was actually made """ if len(self.blocks) == 1: result = self.blocks[0].values[:, loc] if copy: result = result.copy() - return result - - if not copy: - raise TypeError('cannot get view of mixed-type or ' - 'non-consolidated DataFrame') + return result, copy items = self.items # non-unique (GH4726) if not items.is_unique: - return self._interleave(items).ravel() + return self._interleave(items).ravel(), True # unique dtype = _interleaved_dtype(self.blocks) @@ -2593,7 +2591,7 @@ def fast_2d_xs(self, loc, copy=False): i = items.get_loc(item) result[i] = blk._try_coerce_result(blk.iget((j, loc))) - return result + return result, True def consolidate(self): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index e62bf2f36d209..3e8202c7ec0b6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -22,7 +22,8 @@ _values_from_object, _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, - ABCSparseArray, _maybe_match_name, _ensure_object) + ABCSparseArray, _maybe_match_name, _ensure_object, + SettingWithCopyError) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) @@ -575,6 +576,8 @@ def __setitem__(self, key, value): try: self._set_with_engine(key, value) return + except (SettingWithCopyError): + raise except (KeyError, ValueError): values = self.values if (com.is_integer(key) @@ -623,6 +626,7 @@ def _set_with_engine(self, key, value): values = self.values try: self.index._engine.set_value(values, key, value) + self._check_setitem_copy() return except KeyError: values[self.index.get_loc(key)] = value diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index e0abe7700f28d..ffc40ffbadc39 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11059,9 +11059,11 @@ def test_xs_view(self): dm.xs(2)[:] = 10 self.assert_((dm.xs(2) == 5).all()) + # prior to chained assignment (GH5390) + # this would raise, but now just rrens a copy (and sets _is_copy) # TODO (?): deal with mixed-type fiasco? - with assertRaisesRegexp(TypeError, 'cannot get view of mixed-type'): - self.mixed_frame.xs(self.mixed_frame.index[2], copy=False) + # with assertRaisesRegexp(TypeError, 'cannot get view of mixed-type'): + # self.mixed_frame.xs(self.mixed_frame.index[2], copy=False) # unconsolidated dm['foo'] = 6. diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 2ad9f10d1b990..5732d2ad56a4f 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -987,6 +987,7 @@ def f(): NUM_COLS = 10 col_names = ['A'+num for num in map(str,np.arange(NUM_COLS).tolist())] index_cols = col_names[:5] + df = DataFrame(np.random.randint(5, size=(NUM_ROWS,NUM_COLS)), dtype=np.int64, columns=col_names) df = df.set_index(index_cols).sort_index() grp = df.groupby(level=index_cols[:4]) @@ -1680,6 +1681,61 @@ def test_setitem_cache_updating(self): self.assert_(df.ix[0,'c'] == 0.0) self.assert_(df.ix[7,'c'] == 1.0) + def test_detect_chained_assignment(self): + + pd.set_option('chained_assignment','raise') + + # work with the chain + expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) + df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB')) + self.assert_(not df._is_copy) + + df['A'][0] = -5 + df['A'][1] = -6 + assert_frame_equal(df, expected) + + expected = DataFrame([[-5,2],[np.nan,3.]],columns=list('AB')) + df = DataFrame({ 'A' : np.arange(2), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) + self.assert_(not df._is_copy) + df['A'][0] = -5 + df['A'][1] = np.nan + assert_frame_equal(df, expected) + self.assert_(not df['A']._is_copy) + + # using a copy (the chain), fails + df = DataFrame({ 'A' : np.arange(2), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) + def f(): + df.loc[0]['A'] = -5 + self.assertRaises(com.SettingWithCopyError, f) + + # doc example + df = DataFrame({'a' : ['one', 'one', 'two', + 'three', 'two', 'one', 'six'], + 'c' : np.arange(7) }) + self.assert_(not df._is_copy) + expected = DataFrame({'a' : ['one', 'one', 'two', + 'three', 'two', 'one', 'six'], + 'c' : [42,42,2,3,4,42,6]}) + + def f(): + df[df.a.str.startswith('o')]['c'] = 42 + self.assertRaises(com.SettingWithCopyError, f) + df['c'][df.a.str.startswith('o')] = 42 + assert_frame_equal(df,expected) + + expected = DataFrame({'A':[111,'bbb','ccc'],'B':[1,2,3]}) + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + df['A'][0] = 111 + def f(): + df.loc[0]['A'] = 111 + self.assertRaises(com.SettingWithCopyError, f) + assert_frame_equal(df,expected) + + # warnings + pd.set_option('chained_assignment','warn') + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + df.loc[0]['A'] = 111 + def test_floating_index_doc_example(self): index = Index([1.5, 2, 3, 4.5, 5])