-
-
Notifications
You must be signed in to change notification settings - Fork 17.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: add data hashing routines (#14729)
xref dask/dask#1807
- Loading branch information
Showing
5 changed files
with
498 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
# cython: profile=False | ||
# Translated from the reference implementation | ||
# at https://github.com/veorq/SipHash | ||
|
||
import cython | ||
cimport numpy as cnp | ||
import numpy as np | ||
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t | ||
|
||
from cpython cimport (PyString_Check, | ||
PyBytes_Check, | ||
PyUnicode_Check) | ||
from libc.stdlib cimport malloc, free | ||
|
||
# Compile-time SipHash round counts (2 and 4, i.e. SipHash-2-4): | ||
# cROUNDS rounds of _sipround are run for every absorbed 64-bit word and | ||
# for the final length/tail word; dROUNDS rounds are run at finalization. | ||
DEF cROUNDS = 2 | ||
DEF dROUNDS = 4 | ||
|
||
|
||
@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
    """
    Hash every element of an object array with SipHash.

    Parameters
    ----------
    arr : 1-d object ndarray of objects
    key : hash key, must be 16 byte len encoded
    encoding : encoding for key & arr, default to 'utf8'

    Returns
    -------
    1-d uint64 ndarray of hashes

    Raises
    ------
    ValueError
        if the encoded key is not exactly 16 bytes long
    MemoryError
        if the temporary C buffers cannot be allocated
    """
    cdef:
        Py_ssize_t i, n
        ndarray[uint64_t] result
        bytes data, k
        uint8_t *kb
        uint64_t *lens
        char **vecs
        object val

    k = <bytes>key.encode(encoding)
    kb = <uint8_t *>k
    if len(k) != 16:
        raise ValueError(
            'key should be a 16-byte string encoded, got {!r} (len {})'.format(
                k, len(k)))

    n = len(arr)

    # C-side views of the encoded values: one pointer and one length per
    # element. Lengths are stored as uint64 so that values longer than
    # 255 bytes are not truncated before hashing.
    vecs = <char **> malloc(n * sizeof(char *))
    lens = <uint64_t *> malloc(n * sizeof(uint64_t))

    # keeps every encoded bytes object referenced while vecs points into it
    cdef list datas = []

    try:
        # malloc(0) may legitimately return NULL, so only treat NULL as a
        # failure when there is something to store
        if n > 0 and (vecs == NULL or lens == NULL):
            raise MemoryError()

        for i in range(n):
            val = arr[i]
            if PyString_Check(val):
                data = <bytes>val.encode(encoding)
            elif PyBytes_Check(val):
                data = <bytes>val
            elif PyUnicode_Check(val):
                data = <bytes>val.encode(encoding)
            else:
                # non-strings
                data = <bytes>str(val).encode(encoding)

            # keep the reference alive thru the end of the function
            datas.append(data)
            lens[i] = len(data)
            vecs[i] = <char *>data

        result = np.empty(n, dtype=np.uint64)
        with nogil:
            for i in range(n):
                result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)
    finally:
        # release the buffers on success and on any error raised above
        free(vecs)
        free(lens)

    return result
|
||
cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
    # Rotate the 64-bit value x left by b bits: the bits shifted out at the
    # top are OR-ed back in at the bottom.
    cdef uint64_t shifted_up = x << b
    cdef uint64_t wrapped = x >> (64 - b)
    return shifted_up | wrapped
|
||
cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
    # Write v into p[0..3], least-significant byte first.
    cdef int k
    for k in range(4):
        p[k] = <uint8_t>(v >> (8 * k))
|
||
cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
    # Write v into p[0..7], least-significant byte first, as two 32-bit
    # halves: low half at p, high half at p + 4.
    cdef uint32_t low_half = <uint32_t>v
    cdef uint32_t high_half = <uint32_t>(v >> 32)
    u32to8_le(p, low_half)
    u32to8_le(p + 4, high_half)
|
||
cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
    # Assemble p[0..7] into one 64-bit value, p[0] being the
    # least-significant byte.
    cdef uint64_t word = 0
    cdef int k
    for k in range(8):
        word |= (<uint64_t>p[k]) << (8 * k)
    return word
|
||
cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
                           uint64_t* v2, uint64_t* v3) nogil:
    # One SipHash mixing round over the four state words, updated in place
    # through the pointers. The state is copied into locals, mixed in the
    # same add / rotate / xor order, then written back.
    # NOTE(review): assumes the four pointers refer to distinct variables,
    # as at the call sites in low_level_siphash.
    cdef uint64_t a = v0[0]
    cdef uint64_t b = v1[0]
    cdef uint64_t c = v2[0]
    cdef uint64_t d = v3[0]

    a += b
    b = _rotl(b, 13) ^ a
    a = _rotl(a, 32)

    c += d
    d = _rotl(d, 16) ^ c

    a += d
    d = _rotl(d, 21) ^ a

    c += b
    b = _rotl(b, 17) ^ c
    c = _rotl(c, 32)

    v0[0] = a
    v1[0] = b
    v2[0] = c
    v3[0] = d
|
||
cpdef uint64_t siphash(bytes data, bytes key) except? 0:
    """
    Return the 64-bit SipHash of ``data`` under ``key``.

    Parameters
    ----------
    data : bytes to hash
    key : bytes, must be exactly 16 bytes long

    Raises
    ------
    ValueError
        if ``key`` is not 16 bytes long
    """
    cdef Py_ssize_t key_len = len(key)
    if key_len == 16:
        return low_level_siphash(data, len(data), key)
    raise ValueError(
        'key should be a 16-byte bytestring, got {!r} (len {})'.format(
            key, key_len))
|
||
|
||
@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
                                uint8_t* key) nogil:
    # SipHash over data[0:datalen] with the 16-byte key at `key`
    # (cROUNDS compression rounds per word, dROUNDS finalization rounds).
    # Returns the 64-bit digest.
    cdef:
        uint64_t key_lo, key_hi
        uint64_t s0, s1, s2, s3
        uint64_t word, last
        size_t n_words, w
        uint8_t* tail
        int n_tail, r, j

    # the key is read as two little-endian 64-bit halves
    key_lo = u8to64_le(key)
    key_hi = u8to64_le(key + 8)

    # initial state: fixed constants mixed with the key
    s0 = 0x736f6d6570736575ULL ^ key_lo
    s1 = 0x646f72616e646f6dULL ^ key_hi
    s2 = 0x6c7967656e657261ULL ^ key_lo
    s3 = 0x7465646279746573ULL ^ key_hi

    # number of complete 8-byte words and of leftover bytes after them
    n_words = datalen >> 3
    n_tail = datalen & 7
    tail = data + (n_words << 3)

    # absorb every complete little-endian 64-bit word
    for w in range(n_words):
        word = u8to64_le(data + (w << 3))
        s3 ^= word
        for r in range(cROUNDS):
            _sipround(&s0, &s1, &s2, &s3)
        s0 ^= word

    # last word: the input length in the top byte, leftover bytes below it
    last = (<uint64_t>datalen) << 56
    for j in range(n_tail):
        last |= (<uint64_t>tail[j]) << (j * 8)

    s3 ^= last
    for r in range(cROUNDS):
        _sipround(&s0, &s1, &s2, &s3)
    s0 ^= last

    # finalization
    s2 ^= 0xff
    for r in range(dROUNDS):
        _sipround(&s0, &s1, &s2, &s3)

    return s0 ^ s1 ^ s2 ^ s3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
""" | ||
data hash pandas / numpy objects | ||
""" | ||
|
||
import numpy as np | ||
from pandas import _hash, Series, factorize, Categorical, Index | ||
from pandas.lib import infer_dtype | ||
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame | ||
from pandas.types.common import is_categorical_dtype | ||
|
||
# Default hashing key, used by hash_pandas_object / hash_array when | ||
# hash_key is None. It is 16 characters long. | ||
_default_hash_key = '0123456789123456' | ||
|
||
|
||
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
    """
    Return a data hash of the Index/Series/DataFrame

    .. versionadded:: 0.19.2

    Parameters
    ----------
    obj : Index, Series or DataFrame
    index : boolean, default True
        include the index in the hash (if Series/DataFrame)
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        if ``obj`` is not an Index, Series or DataFrame
    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        # fold another array of hashes into h, in place: h = h * 3 + other
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    def index_hashes():
        # per-row hashes of obj.index as a plain uint64 ndarray
        return hash_pandas_object(obj.index,
                                  index=False,
                                  encoding=encoding,
                                  hash_key=hash_key).values

    if isinstance(obj, ABCIndexClass):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        h = Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, ABCSeries):
        h = hash_array(obj.values, encoding, hash_key).astype('uint64')
        if index:
            h = adder(h, index_hashes())
        h = Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, ABCDataFrame):
        cols = obj.iteritems()
        first = next(cols, None)
        if first is None:
            # no columns: start from zeros instead of leaking StopIteration,
            # so the result still has one entry per row
            h = np.zeros(len(obj), dtype='uint64')
        else:
            h = hash_array(first[1].values, encoding,
                           hash_key).astype('uint64')
        for _, col in cols:
            h = adder(h, hash_array(col.values, encoding, hash_key))
        if index:
            h = adder(h, index_hashes())

        h = Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h
|
||
|
||
def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """
    if hash_key is None:
        hash_key = _default_hash_key

    # work with categoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.)
    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early; the caller's encoding and key are forwarded so
    # the recursive calls use the same settings as this one
    if np.issubdtype(vals.dtype, np.complex128):
        return (hash_array(vals.real, encoding, hash_key) +
                23 * hash_array(vals.imag, encoding, hash_key))

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
        np.issubdtype(vals.dtype, np.timedelta64) or
            np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        # reinterpret the raw bytes as unsigned ints of the same width, then
        # widen to 64 bits (astype returns a new array, so the in-place
        # mixing below does not touch the caller's data)
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # its MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    # (same shifts and multipliers as the SplitMix64 finalizer)
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
Oops, something went wrong.