diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c41524619de95..2f67a569de5bd 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -938,6 +938,7 @@ Bug fixes Categorical ^^^^^^^^^^^ +- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`) - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) - Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`) - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b570cc90e4948..6f572d0f72df5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -460,6 +460,10 @@ def __init__( codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) else: + preserve_object = False + if isinstance(values, (ABCIndex, ABCSeries)) and values.dtype == object: + # GH#61778 + preserve_object = True if not isinstance(values, ABCIndex): # in particular RangeIndex xref test_index_equal_range_categories values = sanitize_array(values, None) @@ -476,7 +480,14 @@ def __init__( "by passing in a categories argument." 
) from err - # we're inferring from values + if preserve_object: + # GH#61778 wrap categories in an Index to prevent dtype + # inference in the CategoricalDtype constructor + from pandas import Index + + categories = Index(categories, dtype=object, copy=False) + + # if not preserve_object, we're inferring from values dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index a801dea66f167..c14c3b10f48e0 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -809,3 +809,28 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories): result = Categorical(values=values, categories=categories).categories expected = RangeIndex(range(5)) tm.assert_index_equal(result, expected, exact=True) + + def test_categorical_preserve_object_dtype_from_pandas(self, using_infer_string): + # GH#61778 + pylist = ["foo", "bar", "baz"] + ser = Series(pylist, dtype="object") + idx = Index(pylist, dtype="object") + arr = np.array(pylist, dtype="object") + + cat_from_ser = Categorical(ser) + cat_from_idx = Categorical(idx) + cat_from_arr = Categorical(arr) + cat_from_list = Categorical(pylist) + + # Series/Index with object dtype: categories keep + # object dtype (no string inference is applied) + assert cat_from_ser.categories.dtype == object + assert cat_from_idx.categories.dtype == object + + if using_infer_string: + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.dtype == "str" + assert cat_from_list.categories.dtype == "str" + else: + assert cat_from_arr.categories.dtype == object + assert cat_from_list.categories.dtype == object diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 536e94483c36f..519c2c3064e59 100644 --- 
a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -347,6 +347,10 @@ def test_against_frame_and_seriesgroupby( index_frame = expected.index.to_frame(index=False) index_frame["gender"] = index_frame["both"].str.split("-").str.get(0) index_frame["education"] = index_frame["both"].str.split("-").str.get(1) + both_dtype = index_frame["both"].dtype + index_frame = index_frame.astype( + {"gender": both_dtype, "education": both_dtype} + ) del index_frame["both"] index_frame2 = index_frame.rename({0: None}, axis=1) expected.index = MultiIndex.from_frame(index_frame2) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5202d320108c7..1bb4e35e041d0 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2673,8 +2673,6 @@ def test_pivot_integer_bug(self, any_string_dtype): result = df.pivot(index=1, columns=0, values=2) expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) - if any_string_dtype == "object": - expected_columns = expected_columns.astype("str") tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self):