Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -938,6 +938,7 @@ Bug fixes

Categorical
^^^^^^^^^^^
- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
- Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
Expand Down
13 changes: 12 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,10 @@ def __init__(
codes = arr.indices.to_numpy()
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
else:
preserve_object = False
if isinstance(values, (ABCIndex, ABCSeries)) and values.dtype == object:
# GH#61778
preserve_object = True
if not isinstance(values, ABCIndex):
# in particular RangeIndex xref test_index_equal_range_categories
values = sanitize_array(values, None)
Expand All @@ -476,7 +480,14 @@ def __init__(
"by passing in a categories argument."
) from err

# we're inferring from values
if preserve_object:
# GH#61778 wrap categories in an Index to prevent dtype
# inference in the CategoricalDtype constructor
from pandas import Index

categories = Index(categories, dtype=object, copy=False)

# if not preserve_object, we're inferring from values
dtype = CategoricalDtype(categories, dtype.ordered)

elif isinstance(values.dtype, CategoricalDtype):
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -809,3 +809,28 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories):
result = Categorical(values=values, categories=categories).categories
expected = RangeIndex(range(5))
tm.assert_index_equal(result, expected, exact=True)

def test_categorical_preserve_object_dtype_from_pandas(self, using_infer_string):
# GH#61778
pylist = ["foo", "bar", "baz"]
ser = Series(pylist, dtype="object")
idx = Index(pylist, dtype="object")
arr = np.array(pylist, dtype="object")

cat_from_ser = Categorical(ser)
cat_from_idx = Categorical(idx)
cat_from_arr = Categorical(arr)
cat_from_list = Categorical(pylist)

# Series/Index with object dtype: infer string
# dtype if all elements are strings
assert cat_from_ser.categories.dtype == object
assert cat_from_idx.categories.dtype == object

if using_infer_string:
# Numpy array or list: infer string dtype
assert cat_from_arr.categories.dtype == "str"
assert cat_from_list.categories.dtype == "str"
else:
assert cat_from_arr.categories.dtype == object
assert cat_from_list.categories.dtype == object
4 changes: 4 additions & 0 deletions pandas/tests/groupby/methods/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,10 @@ def test_against_frame_and_seriesgroupby(
index_frame = expected.index.to_frame(index=False)
index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
both_dtype = index_frame["both"].dtype
index_frame = index_frame.astype(
{"gender": both_dtype, "education": both_dtype}
)
del index_frame["both"]
index_frame2 = index_frame.rename({0: None}, axis=1)
expected.index = MultiIndex.from_frame(index_frame2)
Expand Down
2 changes: 0 additions & 2 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2673,8 +2673,6 @@ def test_pivot_integer_bug(self, any_string_dtype):

result = df.pivot(index=1, columns=0, values=2)
expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
if any_string_dtype == "object":
expected_columns = expected_columns.astype("str")
tm.assert_index_equal(result.columns, expected_columns)

def test_pivot_index_none(self):
Expand Down
Loading