From ef37e1ddffda856728bd6aa38a6982888981c06b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Oct 2025 13:26:14 -0700 Subject: [PATCH 1/3] DEPR: back-compat shim for select_dtypes --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 21 +++++++++++++ .../tests/frame/methods/test_select_dtypes.py | 31 ++++++++++++++----- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 0045fc7b9c221..94a3f290ea781 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -716,6 +716,7 @@ Other Deprecations - Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`) - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.unstack` and :meth:`DataFrame.unstack` (:issue:`12189`, :issue:`53868`) - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.shift` and :meth:`DataFrame.shift` (:issue:`53802`) +- Deprecated backward-compatibility behavior for :meth:`DataFrame.select_dtypes` matching "str" dtype when ``np.object_`` is specified (:issue:`61916`) - Deprecated option "future.no_silent_downcasting", as it is no longer used. 
In a future version accessing this option will raise (:issue:`59502`) - Deprecated slicing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` using a ``datetime.date`` object, explicitly cast to :class:`Timestamp` instead (:issue:`35830`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c41b82bbbc8e..bb498997551a7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5237,6 +5237,27 @@ def predicate(arr: ArrayLike) -> bool: return True + blk_dtypes = [blk.dtype for blk in self._mgr.blocks] + if ( + np.object_ in include + and str not in include + and str not in exclude + and any( + isinstance(dtype, StringDtype) and dtype.na_value is np.nan + for dtype in blk_dtypes + ) + ): + # GH#61916 + warnings.warn( + "For backward compatibility, 'str' dtypes are included by " + "select_dtypes when object dtypes are specified. " + "This behavior is deprecated and will be removed in a future " + "version. Explicitly pass 'str' to `include` to select them, " + "or to `exclude` to remove them and silence this warning.", + Pandas4Warning, + stacklevel=find_stack_level(), + ) + mgr = self._mgr._get_data_subset(predicate).copy(deep=False) return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 1ba6b9c437726..c6aff45582dd7 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.errors import Pandas4Warning + from pandas.core.dtypes.dtypes import ExtensionDtype import pandas as pd @@ -102,7 +104,12 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ri = df.select_dtypes(include=[str]) tm.assert_frame_equal(ri, ei) - ri = df.select_dtypes(include=["object"]) + msg = "For backward compatibility, 'str' dtypes are included" + warn = None + if using_infer_string: 
+ warn = Pandas4Warning + with tm.assert_produces_warning(warn, match=msg): + ri = df.select_dtypes(include=["object"]) ei = df[["a"]] tm.assert_frame_equal(ri, ei) @@ -312,15 +319,18 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin ) df["g"] = df.f.diff() assert not hasattr(np, "u8") - r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - # if using_infer_string: - # TODO warn + + msg = "For backward compatibility, 'str' dtypes are included" + warn = None + if using_infer_string: + warn = Pandas4Warning + with tm.assert_produces_warning(warn, match=msg): + r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) e = df[["a", "b"]] tm.assert_frame_equal(r, e) - r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - # if using_infer_string: - # TODO warn + with tm.assert_produces_warning(warn, match=msg): + r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) @@ -497,7 +507,12 @@ def test_select_dtype_object_and_str(self, using_infer_string): ) # with "object" -> only select the object or default str dtype column - result = df.select_dtypes(include=["object"]) + msg = "For backward compatibility, 'str' dtypes are included" + warn = None + if using_infer_string: + warn = Pandas4Warning + with tm.assert_produces_warning(warn, match=msg): + result = df.select_dtypes(include=["object"]) expected = df[["a"]] tm.assert_frame_equal(result, expected) From 3d2fcfd16ad771d163534d176a71acede825dd88 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Oct 2025 14:05:55 -0700 Subject: [PATCH 2/3] update basics doc --- doc/source/user_guide/basics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 3fdd15462b51e..8d0d9ede1b41b 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2372,11 +2372,11 @@ integers: 
df.select_dtypes(include=["number", "bool"], exclude=["unsignedinteger"]) -To select string columns you must use the ``object`` dtype: +To select string columns include ``str``: .. ipython:: python - df.select_dtypes(include=["object"]) + df.select_dtypes(include=[str]) To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you can define a function that returns a tree of child dtypes: From 8f9d14a51bcb3b74fe85bb283bb8cda5908024d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 16 Oct 2025 15:38:14 -0700 Subject: [PATCH 3/3] Suggested edits --- doc/source/user_guide/basics.rst | 4 ++++ pandas/core/frame.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 8d0d9ede1b41b..9ff63a909d0ab 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -2378,6 +2378,10 @@ To select string columns include ``str``: df.select_dtypes(include=[str]) +.. note:: + + This is a change in pandas 3.0. Previously, strings were stored in ``object`` dtype columns, so they were selected with ``include=[object]``. See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#hardcoded-use-of-object-dtype. + To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you can define a function that returns a tree of child dtypes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb498997551a7..86b463bfc7b81 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5250,7 +5250,7 @@ def predicate(arr: ArrayLike) -> bool: # GH#61916 warnings.warn( "For backward compatibility, 'str' dtypes are included by " - "select_dtypes when object dtypes are specified. " + "select_dtypes when 'object' dtype is specified. " "This behavior is deprecated and will be removed in a future " "version. Explicitly pass 'str' to `include` to select them, " "or to `exclude` to remove them and silence this warning.",