pandas-dev · jbrockmendel · Aug 18, 2025 · Aug 19, 2025 · Aug 19, 2025 · Aug 19, 2025
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst
@@ -77,7 +77,7 @@ By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it
 .. ipython:: python
 
     raw_cat = pd.Categorical(
-        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False
+        [None, "b", "c", None], categories=["b", "c", "d"], ordered=False
     )
     s = pd.Series(raw_cat)
     s
@@ -145,7 +145,7 @@ of :class:`~pandas.api.types.CategoricalDtype`.
 
     from pandas.api.types import CategoricalDtype
 
-    s = pd.Series(["a", "b", "c", "a"])
+    s = pd.Series([None, "b", "c", None])
     cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
     s_cat = s.astype(cat_type)
     s_cat

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -499,11 +499,14 @@ When using ``dtype=CategoricalDtype``, "unexpected" values outside of
 ``dtype.categories`` are treated as missing values.
 
 .. ipython:: python
+   :okwarning:
 
    dtype = CategoricalDtype(["a", "b", "d"])  # No 'c'
    pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1
 
-This matches the behavior of :meth:`Categorical.set_categories`.
+This matches the behavior of :meth:`Categorical.set_categories`. This behavior is
+deprecated. In a future version, the presence of non-NA values that are not
+among the specified categories will raise.
 
 .. note::
 

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -611,6 +611,7 @@ Other Deprecations
 - Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
 - Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
 - Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`)
+- Deprecated allowing constructing or casting to :class:`Categorical` with non-NA values that are not present in specified ``dtype.categories`` (:issue:`40996`)
 - Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`,  :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
 - Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
 - Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -11,6 +11,7 @@
     cast,
     overload,
 )
+import warnings
 
 import numpy as np
 
@@ -23,6 +24,7 @@
 )
 from pandas._libs.arrays import NDArrayBacked
 from pandas.compat.numpy import function as nv
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.cast import (
@@ -478,7 +480,11 @@ def __init__(
         elif isinstance(values.dtype, CategoricalDtype):
             old_codes = extract_array(values)._codes
             codes = recode_for_categories(
-                old_codes, values.dtype.categories, dtype.categories, copy=copy
+                old_codes,
+                values.dtype.categories,
+                dtype.categories,
+                copy=copy,
+                warn=True,
             )
 
         else:
@@ -530,7 +536,12 @@ def _from_sequence(
 
     def _cast_pointwise_result(self, values) -> ArrayLike:
         res = super()._cast_pointwise_result(values)
-        cat = type(self)._from_sequence(res, dtype=self.dtype)
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                "Constructing a Categorical with a dtype and values containing",
+            )
+            cat = type(self)._from_sequence(res, dtype=self.dtype)
         if (cat.isna() == isna(res)).all():
             # i.e. the conversion was non-lossy
             return cat
@@ -567,6 +578,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
             dtype = self.dtype.update_dtype(dtype)
             self = self.copy() if copy else self
             result = self._set_dtype(dtype, copy=False)
+            wrong = result.isna() & ~self.isna()
+            if wrong.any():
+                warnings.warn(
+                    "Constructing a Categorical with a dtype and values containing "
+                    "non-null entries not in that dtype's categories is deprecated "
+                    "and will raise in a future version.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
 
         elif isinstance(dtype, ExtensionDtype):
             return super().astype(dtype, copy=copy)
@@ -661,14 +681,16 @@ def _from_inferred_categories(
         if known_categories:
             # Recode from observation order to dtype.categories order.
             categories = dtype.categories
-            codes = recode_for_categories(inferred_codes, cats, categories, copy=False)
+            codes = recode_for_categories(
+                inferred_codes, cats, categories, copy=False, warn=True
+            )
         elif not cats.is_monotonic_increasing:
             # Sort categories and recode for unknown categories.
             unsorted = cats.copy()
             categories = cats.sort_values()
 
             codes = recode_for_categories(
-                inferred_codes, unsorted, categories, copy=False
+                inferred_codes, unsorted, categories, copy=False, warn=True
             )
             dtype = CategoricalDtype(categories, ordered=False)
         else:
@@ -789,7 +811,7 @@ def categories(self) -> Index:
         >>> ser.cat.categories
         Index(['a', 'b', 'c'], dtype='str')
 
-        >>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"])
+        >>> raw_cat = pd.Categorical([None, "b", "c", None], categories=["b", "c", "d"])
         >>> ser = pd.Series(raw_cat)
         >>> ser.cat.categories
         Index(['b', 'c', 'd'], dtype='str')
@@ -1097,7 +1119,7 @@ def set_categories(
         For :class:`pandas.Series`:
 
         >>> raw_cat = pd.Categorical(
-        ...     ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
+        ...     ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
         ... )
         >>> ser = pd.Series(raw_cat)
         >>> ser
@@ -1119,7 +1141,7 @@ def set_categories(
         For :class:`pandas.CategoricalIndex`:
 
         >>> ci = pd.CategoricalIndex(
-        ...     ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
+        ...     ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
         ... )
         >>> ci
         CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'],
@@ -1147,7 +1169,7 @@ def set_categories(
             codes = cat._codes
         else:
             codes = recode_for_categories(
-                cat.codes, cat.categories, new_dtype.categories, copy=False
+                cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False
             )
         NDArrayBacked.__init__(cat, codes, new_dtype)
         return cat
@@ -2960,7 +2982,7 @@ def codes(self) -> Series:
 
         Examples
         --------
-        >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"])
+        >>> raw_cate = pd.Categorical(["a", "b", None, "a"], categories=["a", "b"])
         >>> ser = pd.Series(raw_cate)
         >>> ser.cat.codes
         0   0
@@ -2995,11 +3017,25 @@ def _get_codes_for_values(
     If `values` is known to be a Categorical, use recode_for_categories instead.
     """
     codes = categories.get_indexer_for(values)
+    wrong = (codes == -1) & ~isna(values)
+    if wrong.any():
+        warnings.warn(
+            "Constructing a Categorical with a dtype and values containing "
+            "non-null entries not in that dtype's categories is deprecated "
+            "and will raise in a future version.",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
     return coerce_indexer_dtype(codes, categories)
 
 
 def recode_for_categories(
-    codes: np.ndarray, old_categories, new_categories, *, copy: bool
+    codes: np.ndarray,
+    old_categories,
+    new_categories,
+    *,
+    copy: bool = True,
+    warn: bool = False,
 ) -> np.ndarray:
     """
     Convert a set of codes for to a new set of categories
@@ -3010,6 +3046,8 @@ def recode_for_categories(
     old_categories, new_categories : Index
     copy: bool, default True
         Whether to copy if the codes are unchanged.
+    warn : bool, default False
+        Whether to warn on silent-NA mapping.
 
     Returns
     -------
@@ -3034,9 +3072,18 @@ def recode_for_categories(
             return codes.copy()
         return codes
 
-    indexer = coerce_indexer_dtype(
-        new_categories.get_indexer_for(old_categories), new_categories
-    )
+    codes_in_old_cats = new_categories.get_indexer_for(old_categories)
+    if warn:
+        wrong = codes_in_old_cats == -1
+        if wrong.any():
+            warnings.warn(
+                "Constructing a Categorical with a dtype and values containing "
+                "non-null entries not in that dtype's categories is deprecated "
+                "and will raise in a future version.",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+    indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories)
     new_codes = take_nd(indexer, codes, fill_value=-1)
     return new_codes
 

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -203,7 +203,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
     Examples
     --------
     >>> t = pd.CategoricalDtype(categories=["b", "a"], ordered=True)
-    >>> pd.Series(["a", "b", "a", "c"], dtype=t)
+    >>> pd.Series(["a", "b", "a", None], dtype=t)
     0      a
     1      b
     2      a

@@ -718,7 +718,7 @@ def groups(self) -> dict[Hashable, Index]:
             return self.groupings[0].groups
         result_index, ids = self.result_index_and_ids
         values = result_index._values
-        categories = Categorical(ids, categories=range(len(result_index)))
+        categories = Categorical.from_codes(ids, categories=range(len(result_index)))
         result = {
             # mypy is not aware that group has to be an integer
             values[group]: self.axis.take(axis_ilocs)  # type: ignore[call-overload]

diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -22,7 +22,6 @@
 from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import (
     is_valid_na_for_dtype,
-    isna,
 )
 
 from pandas.core.arrays.categorical import (
@@ -258,6 +257,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
         else:
             values = other
 
+            codes = self.categories.get_indexer(values)
+            if ((codes == -1) & ~values.isna()).any():
+                # GH#37667 see test_equals_non_category
+                raise TypeError(
+                    "categories must match existing categories when appending"
+                )
             cat = Categorical(other, dtype=self.dtype)
             other = CategoricalIndex(cat)
             if not other.isin(values).all():
@@ -266,12 +271,6 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
                 )
             cat = other._values
 
-            if not ((cat == values) | (isna(cat) & isna(values))).all():
-                # GH#37667 see test_equals_non_category
-                raise TypeError(
-                    "categories must match existing categories when appending"
-                )
-
         return cat
 
     def equals(self, other: object) -> bool:

diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py
@@ -289,8 +289,16 @@ def test_set_categories(self):
         ],
     )
     def test_set_categories_many(self, values, categories, new_categories, ordered):
-        c = Categorical(values, categories)
-        expected = Categorical(values, new_categories, ordered)
+        msg = "Constructing a Categorical with a dtype and values containing"
+
+        warn1 = FutureWarning if set(values).difference(categories) else None
+        with tm.assert_produces_warning(warn1, match=msg):
+            c = Categorical(values, categories)
+
+        warn2 = FutureWarning if set(values).difference(new_categories) else None
+        with tm.assert_produces_warning(warn2, match=msg):
+            expected = Categorical(values, new_categories, ordered)
+
         result = c.set_categories(new_categories, ordered=ordered)
         tm.assert_categorical_equal(result, expected)
 

diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py
@@ -121,8 +121,11 @@ def test_astype_category(self, dtype_ordered, ordered):
 
         # non-standard categories
         dtype = CategoricalDtype(list("adc"), dtype_ordered)
-        result = cat.astype(dtype)
-        expected = Categorical(data, dtype=dtype)
+        msg = "Constructing a Categorical with a dtype and values containing"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            result = cat.astype(dtype)
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            expected = Categorical(data, dtype=dtype)
         tm.assert_categorical_equal(result, expected)
 
         if dtype_ordered is False: