|
9 | 9 | Hashable, |
10 | 10 | Iterable, |
11 | 11 | Iterator, |
12 | | - cast, |
13 | 12 | ) |
14 | 13 |
|
15 | 14 | import numpy as np |
16 | 15 |
|
17 | | -from pandas._libs import lib |
18 | 16 | from pandas._libs.hashing import hash_object_array |
19 | 17 | from pandas._typing import ( |
20 | 18 | ArrayLike, |
21 | 19 | npt, |
22 | 20 | ) |
23 | 21 |
|
24 | | -from pandas.core.dtypes.common import ( |
25 | | - is_categorical_dtype, |
26 | | - is_list_like, |
27 | | -) |
| 22 | +from pandas.core.dtypes.common import is_list_like |
28 | 23 | from pandas.core.dtypes.generic import ( |
29 | 24 | ABCDataFrame, |
30 | 25 | ABCExtensionArray, |
|
35 | 30 |
|
36 | 31 | if TYPE_CHECKING: |
37 | 32 | from pandas import ( |
38 | | - Categorical, |
39 | 33 | DataFrame, |
40 | 34 | Index, |
41 | 35 | MultiIndex, |
@@ -214,53 +208,14 @@ def hash_tuples( |
214 | 208 |
|
215 | 209 | # hash the list-of-ndarrays |
216 | 210 | hashes = ( |
217 | | - _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals |
| 211 | + cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False) |
| 212 | + for cat in cat_vals |
218 | 213 | ) |
219 | 214 | h = combine_hash_arrays(hashes, len(cat_vals)) |
220 | 215 |
|
221 | 216 | return h |
222 | 217 |
|
223 | 218 |
|
224 | | -def _hash_categorical( |
225 | | - cat: Categorical, encoding: str, hash_key: str |
226 | | -) -> npt.NDArray[np.uint64]: |
227 | | - """ |
228 | | - Hash a Categorical by hashing its categories, and then mapping the codes |
229 | | - to the hashes |
230 | | -
|
231 | | - Parameters |
232 | | - ---------- |
233 | | - cat : Categorical |
234 | | - encoding : str |
235 | | - hash_key : str |
236 | | -
|
237 | | - Returns |
238 | | - ------- |
239 | | - ndarray[np.uint64] of hashed values, same size as len(cat) |
240 | | - """ |
241 | | - # Convert ExtensionArrays to ndarrays |
242 | | - values = np.asarray(cat.categories._values) |
243 | | - hashed = hash_array(values, encoding, hash_key, categorize=False) |
244 | | - |
245 | | - # The hashes are uint64, which cannot directly represent missing values. |
246 | | - # We don't want to use take_nd, which would coerce the result to float; |
247 | | - # instead, we construct the result directly and use |
248 | | - # max(np.uint64) as the missing-value indicator. |
249 | | - # |
250 | | - # TODO: GH 15362 |
251 | | - |
252 | | - mask = cat.isna() |
253 | | - if len(hashed): |
254 | | - result = hashed.take(cat.codes) |
255 | | - else: |
256 | | - result = np.zeros(len(mask), dtype="uint64") |
257 | | - |
258 | | - if mask.any(): |
259 | | - result[mask] = lib.u8max |
260 | | - |
261 | | - return result |
262 | | - |
263 | | - |
264 | 219 | def hash_array( |
265 | 220 | vals: ArrayLike, |
266 | 221 | encoding: str = "utf8", |
@@ -288,17 +243,11 @@ def hash_array( |
288 | 243 | """ |
289 | 244 | if not hasattr(vals, "dtype"): |
290 | 245 | raise TypeError("must pass a ndarray-like") |
291 | | - dtype = vals.dtype |
292 | | - |
293 | | - # For categoricals, we hash the categories, then remap the codes to the |
294 | | - # hash values. (This check is above the complex check so that we don't ask |
295 | | - # numpy if categorical is a subdtype of complex, as it will choke). |
296 | | - if is_categorical_dtype(dtype): |
297 | | - vals = cast("Categorical", vals) |
298 | | - return _hash_categorical(vals, encoding, hash_key) |
299 | 246 |
|
300 | | - elif isinstance(vals, ABCExtensionArray): |
301 | | - vals, _ = vals._values_for_factorize() |
| 247 | + if isinstance(vals, ABCExtensionArray): |
| 248 | + return vals._hash_pandas_object( |
| 249 | + encoding=encoding, hash_key=hash_key, categorize=categorize |
| 250 | + ) |
302 | 251 |
|
303 | 252 | elif not isinstance(vals, np.ndarray): |
304 | 253 | # GH#42003 |
@@ -347,7 +296,9 @@ def _hash_ndarray( |
347 | 296 |
|
348 | 297 | codes, categories = factorize(vals, sort=False) |
349 | 298 | cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) |
350 | | - return _hash_categorical(cat, encoding, hash_key) |
| 299 | + return cat._hash_pandas_object( |
| 300 | + encoding=encoding, hash_key=hash_key, categorize=False |
| 301 | + ) |
351 | 302 |
|
352 | 303 | try: |
353 | 304 | vals = hash_object_array(vals, hash_key, encoding) |
|
0 commit comments