Skip to content

DEPR: Categorical with values not present in categories #62142

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
4 changes: 2 additions & 2 deletions doc/source/user_guide/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it
.. ipython:: python

raw_cat = pd.Categorical(
["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False
[None, "b", "c", None], categories=["b", "c", "d"], ordered=False
)
s = pd.Series(raw_cat)
s
Expand Down Expand Up @@ -145,7 +145,7 @@ of :class:`~pandas.api.types.CategoricalDtype`.

from pandas.api.types import CategoricalDtype

s = pd.Series(["a", "b", "c", "a"])
s = pd.Series([None, "b", "c", None])
cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)
s_cat = s.astype(cat_type)
s_cat
Expand Down
5 changes: 4 additions & 1 deletion doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -499,11 +499,14 @@ When using ``dtype=CategoricalDtype``, "unexpected" values outside of
``dtype.categories`` are treated as missing values.

.. ipython:: python
:okwarning:

dtype = CategoricalDtype(["a", "b", "d"]) # No 'c'
pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1

This matches the behavior of :meth:`Categorical.set_categories`.
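This matches the behavior of :meth:`Categorical.set_categories`. Treating such
values as missing when a ``dtype`` is specified is deprecated
(:meth:`Categorical.set_categories` itself is unaffected). In a future version,
the presence of non-NA values that are not among the specified categories will raise.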

.. note::

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ Other Deprecations
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
- Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`)
- Deprecated constructing or casting to :class:`Categorical` with non-NA values that are not present in the specified ``dtype.categories`` (:issue:`40996`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)
Expand Down
73 changes: 60 additions & 13 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
cast,
overload,
)
import warnings

import numpy as np

Expand All @@ -23,6 +24,7 @@
)
from pandas._libs.arrays import NDArrayBacked
from pandas.compat.numpy import function as nv
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -478,7 +480,11 @@ def __init__(
elif isinstance(values.dtype, CategoricalDtype):
old_codes = extract_array(values)._codes
codes = recode_for_categories(
old_codes, values.dtype.categories, dtype.categories, copy=copy
old_codes,
values.dtype.categories,
dtype.categories,
copy=copy,
warn=True,
)

else:
Expand Down Expand Up @@ -530,7 +536,12 @@ def _from_sequence(

def _cast_pointwise_result(self, values) -> ArrayLike:
res = super()._cast_pointwise_result(values)
cat = type(self)._from_sequence(res, dtype=self.dtype)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Constructing a Categorical with a dtype and values containing",
)
cat = type(self)._from_sequence(res, dtype=self.dtype)
if (cat.isna() == isna(res)).all():
# i.e. the conversion was non-lossy
return cat
Expand Down Expand Up @@ -567,6 +578,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
dtype = self.dtype.update_dtype(dtype)
self = self.copy() if copy else self
result = self._set_dtype(dtype, copy=False)
wrong = result.isna() & ~self.isna()
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)

elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
Expand Down Expand Up @@ -661,14 +681,16 @@ def _from_inferred_categories(
if known_categories:
# Recode from observation order to dtype.categories order.
categories = dtype.categories
codes = recode_for_categories(inferred_codes, cats, categories, copy=False)
codes = recode_for_categories(
inferred_codes, cats, categories, copy=False, warn=True
)
elif not cats.is_monotonic_increasing:
# Sort categories and recode for unknown categories.
unsorted = cats.copy()
categories = cats.sort_values()

codes = recode_for_categories(
inferred_codes, unsorted, categories, copy=False
inferred_codes, unsorted, categories, copy=False, warn=True
)
dtype = CategoricalDtype(categories, ordered=False)
else:
Expand Down Expand Up @@ -789,7 +811,7 @@ def categories(self) -> Index:
>>> ser.cat.categories
Index(['a', 'b', 'c'], dtype='str')

>>> raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"])
>>> raw_cat = pd.Categorical([None, "b", "c", None], categories=["b", "c", "d"])
>>> ser = pd.Series(raw_cat)
>>> ser.cat.categories
Index(['b', 'c', 'd'], dtype='str')
Expand Down Expand Up @@ -1097,7 +1119,7 @@ def set_categories(
For :class:`pandas.Series`:

>>> raw_cat = pd.Categorical(
... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
... )
>>> ser = pd.Series(raw_cat)
>>> ser
Expand All @@ -1119,7 +1141,7 @@ def set_categories(
For :class:`pandas.CategoricalIndex`:

>>> ci = pd.CategoricalIndex(
... ["a", "b", "c", "A"], categories=["a", "b", "c"], ordered=True
... ["a", "b", "c", None], categories=["a", "b", "c"], ordered=True
... )
>>> ci
CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'],
Expand Down Expand Up @@ -1147,7 +1169,7 @@ def set_categories(
codes = cat._codes
else:
codes = recode_for_categories(
cat.codes, cat.categories, new_dtype.categories, copy=False
cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False
Comment on lines 1171 to +1172
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the deprecation is enforced, do we still need to pass the flag on whether to allow values outside the categories?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The flag will become a "should we raise" flag, and it will still be needed because the answer will be "no" for set_categories.

)
NDArrayBacked.__init__(cat, codes, new_dtype)
return cat
Expand Down Expand Up @@ -2960,7 +2982,7 @@ def codes(self) -> Series:

Examples
--------
>>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"])
>>> raw_cate = pd.Categorical(["a", "b", None, "a"], categories=["a", "b"])
>>> ser = pd.Series(raw_cate)
>>> ser.cat.codes
0 0
Expand Down Expand Up @@ -2995,11 +3017,25 @@ def _get_codes_for_values(
If `values` is known to be a Categorical, use recode_for_categories instead.
"""
codes = categories.get_indexer_for(values)
wrong = (codes == -1) & ~isna(values)
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
return coerce_indexer_dtype(codes, categories)


def recode_for_categories(
codes: np.ndarray, old_categories, new_categories, *, copy: bool
codes: np.ndarray,
old_categories,
new_categories,
*,
copy: bool = True,
warn: bool = False,
) -> np.ndarray:
"""
Convert a set of codes to a new set of categories
Expand All @@ -3010,6 +3046,8 @@ def recode_for_categories(
old_categories, new_categories : Index
copy: bool, default True
Whether to copy if the codes are unchanged.
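warn : bool, default False
Whether to issue a FutureWarning when any entry of old_categories is missing
from new_categories, i.e. when values could be silently mapped to NA.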

Returns
-------
Expand All @@ -3034,9 +3072,18 @@ def recode_for_categories(
return codes.copy()
return codes

indexer = coerce_indexer_dtype(
new_categories.get_indexer_for(old_categories), new_categories
)
codes_in_old_cats = new_categories.get_indexer_for(old_categories)
if warn:
wrong = codes_in_old_cats == -1
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories)
new_codes = take_nd(indexer, codes, fill_value=-1)
return new_codes

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype):
Examples
--------
>>> t = pd.CategoricalDtype(categories=["b", "a"], ordered=True)
>>> pd.Series(["a", "b", "a", "c"], dtype=t)
>>> pd.Series(["a", "b", "a", None], dtype=t)
0 a
1 b
2 a
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,7 +718,7 @@ def groups(self) -> dict[Hashable, Index]:
return self.groupings[0].groups
result_index, ids = self.result_index_and_ids
values = result_index._values
categories = Categorical(ids, categories=range(len(result_index)))
categories = Categorical.from_codes(ids, categories=range(len(result_index)))
result = {
# mypy is not aware that group has to be an integer
values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload]
Expand Down
13 changes: 6 additions & 7 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
)

from pandas.core.arrays.categorical import (
Expand Down Expand Up @@ -258,6 +257,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
else:
values = other

codes = self.categories.get_indexer(values)
if ((codes == -1) & ~values.isna()).any():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need the check on L269 with this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think not; I will check and update.

# GH#37667 see test_equals_non_category
raise TypeError(
"categories must match existing categories when appending"
)
cat = Categorical(other, dtype=self.dtype)
other = CategoricalIndex(cat)
if not other.isin(values).all():
Expand All @@ -266,12 +271,6 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
)
cat = other._values

if not ((cat == values) | (isna(cat) & isna(values))).all():
# GH#37667 see test_equals_non_category
raise TypeError(
"categories must match existing categories when appending"
)

return cat

def equals(self, other: object) -> bool:
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/arrays/categorical/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,16 @@ def test_set_categories(self):
],
)
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
msg = "Constructing a Categorical with a dtype and values containing"

warn1 = FutureWarning if set(values).difference(categories) else None
with tm.assert_produces_warning(warn1, match=msg):
c = Categorical(values, categories)

warn2 = FutureWarning if set(values).difference(new_categories) else None
with tm.assert_produces_warning(warn2, match=msg):
expected = Categorical(values, new_categories, ordered)

result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)

Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/arrays/categorical/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,11 @@ def test_astype_category(self, dtype_ordered, ordered):

# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = cat.astype(dtype)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)

if dtype_ordered is False:
Expand Down
Loading
Loading