diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 926239e7e5dc5..e80b1060e867d 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -31,6 +31,8 @@ Fixed Regressions - Fixed regression in :class:`TimedeltaIndex` where `np.sum(index)` incorrectly returned a zero-dimensional object instead of a scalar (:issue:`25282`) - Fixed regression in ``IntervalDtype`` construction where passing an incorrect string with 'Interval' as a prefix could result in a ``RecursionError``. (:issue:`25338`) +- Fixed regression in :class:`Categorical`, where constructing it from a categorical ``Series`` and an explicit ``categories=`` that differed from that in the ``Series`` created an invalid object which could trigger segfaults. (:issue:`25318`) + .. _whatsnew_0242.enhancements: Enhancements diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3f38785e6619e..73a03b4f71b6f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -323,14 +323,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # we may have dtype.categories be None, and we need to # infer categories in a factorization step futher below - if is_categorical(values): - # GH23814, for perf, if values._values already an instance of - # Categorical, set values to codes, and run fastpath - if (isinstance(values, (ABCSeries, ABCIndexClass)) and - isinstance(values._values, type(self))): - values = values._values.codes.copy() - fastpath = True - if fastpath: self._codes = coerce_indexer_dtype(values, dtype.categories) self._dtype = self._dtype.update_dtype(dtype) @@ -382,7 +374,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values): - old_codes = (values.cat.codes if isinstance(values, ABCSeries) + old_codes = (values._values.codes if isinstance(values, ABCSeries) else values.codes) codes = _recode_for_categories(old_codes, values.dtype.categories, dtype.categories) @@ -2627,6 +2619,9 @@ def _recode_for_categories(codes, old_categories, new_categories): if len(old_categories) == 0: # All null anyway, so just retain the nulls return codes.copy() + elif new_categories.equals(old_categories): + # Same categories, so no need to actually recode + return codes.copy() indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), new_categories) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 25c299692ceca..f07e3aba53cd4 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -212,6 +212,18 @@ def test_constructor(self): c = Categorical(np.array([], dtype='int64'), # noqa categories=[3, 2, 1], ordered=True) + def test_constructor_with_existing_categories(self): + # GH25318: constructing with pd.Series used to bogusly skip recoding + # categories + c0 = Categorical(["a", "b", "c", "a"]) + c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"]) + + c2 = Categorical(c0, categories=c1.categories) + tm.assert_categorical_equal(c1, c2) + + c3 = Categorical(Series(c0), categories=c1.categories) + tm.assert_categorical_equal(c1, c3) + def test_constructor_not_sequence(self): # https://github.com/pandas-dev/pandas/issues/16022 msg = r"^Parameter 'categories' must be list-like, was"