Skip to content
Closed
40 changes: 16 additions & 24 deletions pandas/core/dtypes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,31 +482,23 @@ def _hash_categories(categories, ordered: Ordered = True) -> int:
from pandas.core.util.hashing import (
hash_array,
_combine_hash_arrays,
hash_tuples,
)
from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE

if len(categories) and isinstance(categories[0], tuple):
# assumes if any individual category is a tuple, then all our. ATM
# I don't really want to support just some of the categories being
# tuples.
categories = list(categories) # breaks if a np.array of categories
cat_array = hash_tuples(categories)
else:
if categories.dtype == "O":
if len({type(x) for x in categories}) != 1:
# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
# {'1', '2'} the same as {'1', 2}
# find a better solution
hashed = hash((tuple(categories), ordered))
return hashed

if is_datetime64tz_dtype(categories.dtype):
# Avoid future warning.
categories = categories.astype(DT64NS_DTYPE)

cat_array = hash_array(np.asarray(categories), categorize=False)

if categories.dtype == "O":
if len({type(x) for x in categories}) != 1:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we'd like to avoid calling `type` on every element of `categories`. I'm guessing that will be prohibitively slow for large categories.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Can you comment on why something like `hash(str(np.asarray(categories)))` wasn't used?

The imports from `core.util.hashing` are a hassle dependency-structure-wise, so if a simplification is available it'd be helpful, but it's not urgent.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure offhand.

# TODO: hash_array doesn't handle mixed types. It casts
# everything to a str first, which means we treat
# {'1', '2'} the same as {'1', 2}
# find a better solution
hashed = hash((tuple(categories), ordered))
return hashed

if isinstance(categories.dtype, DatetimeTZDtype):
# Avoid future warning.
categories = categories.astype("datetime64[ns]")

cat_array = hash_array(np.asarray(categories), categorize=False)

if ordered:
cat_array = np.vstack(
[cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
Expand Down
2 changes: 0 additions & 2 deletions pandas/core/util/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,6 @@ def hash_array(

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
elif isinstance(dtype, np.bool):
vals = vals.astype("u8")
elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
vals = vals.view("i8").astype("u8", copy=False)
elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
Expand Down