ENH: add data hashing routines (#14729)
jreback committed Nov 28, 2016
1 parent c5f219a commit 06f26b5
Showing 5 changed files with 498 additions and 2 deletions.
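The new functionality lives in two of the files below: pandas/src/hash.pyx implements a SipHash-2-4 kernel, and pandas/tools/hashing.py exposes hash_pandas_object and hash_array on top of it. A minimal usage sketch (illustrative only, assuming the new module is importable as pandas.tools.hashing):

import pandas as pd
from pandas.tools.hashing import hash_pandas_object, hash_array

s = pd.Series(['a', 'b', 'c'])
df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})

hash_array(s.values)                  # 1-d uint64 ndarray, one hash per element
hash_pandas_object(s)                 # uint64 Series; the index is folded into the hash
hash_pandas_object(df, index=False)   # per-row hashes of the frame, ignoring the index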
33 changes: 33 additions & 0 deletions asv_bench/benchmarks/algorithms.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from pandas.util import testing as tm


class algorithm(object):
@@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):

def time_add_overflow_mixed_arr(self):
self.checked_add(self.arr, self.arrmixed)


class hashing(object):
goal_time = 0.2

def setup(self):
N = 100000

self.df = pd.DataFrame(
{'A': pd.Series(tm.makeStringIndex(100).take(
np.random.randint(0, 100, size=N))),
'B': pd.Series(tm.makeStringIndex(10000).take(
np.random.randint(0, 10000, size=N))),
'D': np.random.randn(N),
'E': np.arange(N),
'F': pd.date_range('20110101', freq='s', periods=N),
'G': pd.timedelta_range('1 day', freq='s', periods=N),
})
self.df['C'] = self.df['B'].astype('category')
self.df.iloc[10:20] = np.nan

def time_frame(self):
self.df.hash()

def time_series_int(self):
self.df.E.hash()

def time_series_string(self):
self.df.B.hash()

def time_series_categorical(self):
self.df.C.hash()
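The .hash() accessor used in the benchmark is not defined in the files shown on this page; the same operations can be exercised directly against the module-level function added in pandas/tools/hashing.py. A rough sketch with timeit (illustrative only, not part of the changeset):

import timeit

import numpy as np
import pandas as pd
from pandas.util import testing as tm
from pandas.tools.hashing import hash_pandas_object

N = 100000
df = pd.DataFrame({
    'B': pd.Series(tm.makeStringIndex(10000).take(
        np.random.randint(0, 10000, size=N))),
    'E': np.arange(N),
})
df['C'] = df['B'].astype('category')

for col in ['E', 'B', 'C']:        # int, string, categorical
    t = timeit.timeit(lambda: hash_pandas_object(df[col]), number=10)
    print(col, t / 10)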
180 changes: 180 additions & 0 deletions pandas/src/hash.pyx
@@ -0,0 +1,180 @@
# cython: profile=False
# Translated from the reference implementation
# at https://github.com/veorq/SipHash

import cython
cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from cpython cimport (PyString_Check,
PyBytes_Check,
PyUnicode_Check)
from libc.stdlib cimport malloc, free

DEF cROUNDS = 2
DEF dROUNDS = 4


@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
"""
Parameters
----------
arr : 1-d object ndarray of objects
key : hash key, must be 16 bytes long once encoded
encoding : encoding for key & arr, defaults to 'utf8'

Returns
-------
1-d uint64 ndarray of hashes
"""
cdef:
Py_ssize_t i, l, n
ndarray[uint64_t] result
bytes data, k
uint8_t *kb, *lens
char **vecs, *cdata
object val

k = <bytes>key.encode(encoding)
kb = <uint8_t *>k
if len(k) != 16:
raise ValueError(
'key should be a 16-byte string encoded, got {!r} (len {})'.format(
k, len(k)))

n = len(arr)

# create an array of bytes
vecs = <char **> malloc(n * sizeof(char *))
lens = <uint8_t*> malloc(n * sizeof(uint8_t))

cdef list datas = []
for i in range(n):
val = arr[i]
if PyString_Check(val):
data = <bytes>val.encode(encoding)
elif PyBytes_Check(val):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
else:
# non-strings
data = <bytes>str(val).encode(encoding)

l = len(data)
lens[i] = l
cdata = data

# keep the reference alive through the end of the function
datas.append(data)
vecs[i] = cdata

result = np.empty(n, dtype=np.uint64)
with nogil:
for i in range(n):
result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)

free(vecs)
free(lens)
return result
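hash_object_array encodes every element to bytes (strings/unicode through the given encoding, everything else through str()) and runs each byte string through the SipHash kernel with the 16-byte key. A rough pure-Python equivalent, assuming the built extension is importable as pandas._hash (as pandas/tools/hashing.py below does):

import numpy as np
from pandas import _hash   # compiled from pandas/src/hash.pyx

def hash_object_array_py(arr, key, encoding='utf8'):
    kb = key.encode(encoding)                  # must be exactly 16 bytes once encoded
    out = np.empty(len(arr), dtype=np.uint64)
    for i, val in enumerate(arr):
        if isinstance(val, bytes):
            data = val                         # already bytes: hash as-is
        else:
            data = str(val).encode(encoding)   # str / unicode / everything else
        out[i] = _hash.siphash(data, kb)
    return out

arr = np.array(['a', 'b', 3], dtype=object)
# should agree with the Cython fast path:
# np.testing.assert_array_equal(hash_object_array_py(arr, '0123456789123456'),
#                               _hash.hash_object_array(arr, '0123456789123456'))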

cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
return (x << b) | (x >> (64 - b))

cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
p[0] = <uint8_t>(v)
p[1] = <uint8_t>(v >> 8)
p[2] = <uint8_t>(v >> 16)
p[3] = <uint8_t>(v >> 24)

cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
u32to8_le(p, <uint32_t>v)
u32to8_le(p + 4, <uint32_t>(v >> 32))

cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
return (<uint64_t>p[0] |
<uint64_t>p[1] << 8 |
<uint64_t>p[2] << 16 |
<uint64_t>p[3] << 24 |
<uint64_t>p[4] << 32 |
<uint64_t>p[5] << 40 |
<uint64_t>p[6] << 48 |
<uint64_t>p[7] << 56)
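These helpers are nothing more than a 64-bit rotation and little-endian byte packing; u8to64_le performs the same read as struct.unpack('<Q', ...). A small sanity-check sketch (Python 3, illustrative only):

import struct

MASK64 = (1 << 64) - 1

def rotl64(x, b):
    # Python-level _rotl; the mask is needed because Python ints are unbounded
    return ((x << b) | (x >> (64 - b))) & MASK64

# u8to64_le reads 8 bytes as a little-endian uint64
buf = bytes(range(8))
assert struct.unpack('<Q', buf)[0] == sum(byte << (8 * i) for i, byte in enumerate(buf))

# u64to8_le / u32to8_le write a value back out least-significant byte first
assert struct.pack('<Q', 0x0807060504030201) == bytes([1, 2, 3, 4, 5, 6, 7, 8])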

cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
uint64_t* v2, uint64_t* v3) nogil:
v0[0] += v1[0]
v1[0] = _rotl(v1[0], 13)
v1[0] ^= v0[0]
v0[0] = _rotl(v0[0], 32)
v2[0] += v3[0]
v3[0] = _rotl(v3[0], 16)
v3[0] ^= v2[0]
v0[0] += v3[0]
v3[0] = _rotl(v3[0], 21)
v3[0] ^= v0[0]
v2[0] += v1[0]
v1[0] = _rotl(v1[0], 17)
v1[0] ^= v2[0]
v2[0] = _rotl(v2[0], 32)

cpdef uint64_t siphash(bytes data, bytes key) except? 0:
if len(key) != 16:
raise ValueError(
'key should be a 16-byte bytestring, got {!r} (len {})'.format(
key, len(key)))
return low_level_siphash(data, len(data), key)


@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
uint8_t* key) nogil:
cdef uint64_t v0 = 0x736f6d6570736575ULL
cdef uint64_t v1 = 0x646f72616e646f6dULL
cdef uint64_t v2 = 0x6c7967656e657261ULL
cdef uint64_t v3 = 0x7465646279746573ULL
cdef uint64_t b
cdef uint64_t k0 = u8to64_le(key)
cdef uint64_t k1 = u8to64_le(key + 8)
cdef uint64_t m
cdef int i
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
cdef int left = datalen & 7
cdef int left_byte

b = (<uint64_t>datalen) << 56
v3 ^= k1
v2 ^= k0
v1 ^= k1
v0 ^= k0

while (data != end):
m = u8to64_le(data)
v3 ^= m
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= m

data += sizeof(uint64_t)

for i in range(left-1, -1, -1):
b |= (<uint64_t>data[i]) << (i * 8)

v3 ^= b

for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)

v0 ^= b
v2 ^= 0xff

for i in range(dROUNDS):
_sipround(&v0, &v1, &v2, &v3)

b = v0 ^ v1 ^ v2 ^ v3

return b
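low_level_siphash follows the reference SipHash-2-4 layout: each 8-byte little-endian word is mixed in with cROUNDS (2) SipRounds, the message length and any trailing bytes are folded into a final word, and dROUNDS (4) finalization rounds produce the 64-bit digest. A pure-Python reference sketch (Python 3, illustrative only; it can be cross-checked against the compiled siphash() once the extension is built):

MASK64 = (1 << 64) - 1

def _rotl(x, b):
    return ((x << b) | (x >> (64 - b))) & MASK64

def _sipround(v0, v1, v2, v3):
    v0 = (v0 + v1) & MASK64; v1 = _rotl(v1, 13); v1 ^= v0; v0 = _rotl(v0, 32)
    v2 = (v2 + v3) & MASK64; v3 = _rotl(v3, 16); v3 ^= v2
    v0 = (v0 + v3) & MASK64; v3 = _rotl(v3, 21); v3 ^= v0
    v2 = (v2 + v1) & MASK64; v1 = _rotl(v1, 17); v1 ^= v2; v2 = _rotl(v2, 32)
    return v0, v1, v2, v3

def siphash24(data, key):
    if len(key) != 16:
        raise ValueError('key should be a 16-byte bytestring')
    k0 = int.from_bytes(key[:8], 'little')
    k1 = int.from_bytes(key[8:], 'little')
    v0 = 0x736f6d6570736575 ^ k0
    v1 = 0x646f72616e646f6d ^ k1
    v2 = 0x6c7967656e657261 ^ k0
    v3 = 0x7465646279746573 ^ k1
    b = (len(data) & 0xff) << 56
    nblocks = len(data) // 8
    for i in range(nblocks):                       # full 8-byte words
        m = int.from_bytes(data[8 * i:8 * i + 8], 'little')
        v3 ^= m
        for _ in range(2):                         # cROUNDS
            v0, v1, v2, v3 = _sipround(v0, v1, v2, v3)
        v0 ^= m
    for i, byte in enumerate(data[8 * nblocks:]):  # 0-7 trailing bytes
        b |= byte << (8 * i)
    v3 ^= b
    for _ in range(2):                             # cROUNDS
        v0, v1, v2, v3 = _sipround(v0, v1, v2, v3)
    v0 ^= b
    v2 ^= 0xff
    for _ in range(4):                             # dROUNDS
        v0, v1, v2, v3 = _sipround(v0, v1, v2, v3)
    return v0 ^ v1 ^ v2 ^ v3

# cross-check (assuming the built extension is importable):
# from pandas import _hash
# assert siphash24(b'pandas', b'0123456789123456') == _hash.siphash(b'pandas', b'0123456789123456')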
137 changes: 137 additions & 0 deletions pandas/tools/hashing.py
@@ -0,0 +1,137 @@
"""
data hash pandas / numpy objects
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
"""
Return a data hash of the Index/Series/DataFrame

.. versionadded:: 0.19.2

Parameters
----------
index : boolean, default True
include the index in the hash (if Series/DataFrame)
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key

Returns
-------
Series of uint64, same length as the object
"""
if hash_key is None:
hash_key = _default_hash_key

def adder(h, hashed_to_add):
h = np.multiply(h, np.uint(3), h)
return np.add(h, hashed_to_add, h)

if isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
h = Series(h, index=obj, dtype='uint64')
elif isinstance(obj, ABCSeries):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)
h = Series(h, index=obj.index, dtype='uint64')
elif isinstance(obj, ABCDataFrame):
cols = obj.iteritems()
first_series = next(cols)[1]
h = hash_array(first_series.values, encoding,
hash_key).astype('uint64')
for _, col in cols:
h = adder(h, hash_array(col.values, encoding, hash_key))
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)

h = Series(h, index=obj.index, dtype='uint64')
else:
raise TypeError("Unexpected type for hashing %s" % type(obj))
return h
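For a DataFrame, the per-column hashes are folded together with the adder closure above: h starts as the first column's hash and every further column (and, optionally, the index hash) is combined as h = h * 3 + next with uint64 wrap-around. A sketch of that combination (illustrative only; it assumes the columns iterate in the order 'a', 'b'):

import numpy as np
import pandas as pd
from pandas.tools.hashing import hash_pandas_object, hash_array

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

h = hash_array(df['a'].values).astype('uint64')        # first column
h = h * np.uint64(3) + hash_array(df['b'].values)      # adder() for each remaining column
h = h * np.uint64(3) + hash_pandas_object(df.index,    # index folded in last
                                          index=False).values

# should match the combined frame hash:
# np.testing.assert_array_equal(h, hash_pandas_object(df).values)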


def hash_array(vals, encoding='utf8', hash_key=None):
"""
Given a 1d array, return an array of deterministic integers.

.. versionadded:: 0.19.2

Parameters
----------
vals : ndarray
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key

Returns
-------
1d uint64 numpy array of hash values, same length as the vals
"""

# work with categoricals as ints. (This check is above the complex
# check so that we don't ask numpy if categorical is a subdtype of
# complex, as it will choke.)
if hash_key is None:
hash_key = _default_hash_key

if is_categorical_dtype(vals.dtype):
vals = vals.codes

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(vals.dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# MAIN LOGIC:
inferred = infer_dtype(vals)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if inferred == 'boolean':
vals = vals.astype('u8')

if (np.issubdtype(vals.dtype, np.datetime64) or
np.issubdtype(vals.dtype, np.timedelta64) or
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:

# it's MUCH faster to categorize object dtypes, then hash and rename
codes, categories = factorize(vals, sort=False)
categories = Index(categories)
c = Series(Categorical(codes, categories,
ordered=False, fastpath=True))
vals = _hash.hash_object_array(categories.values,
hash_key,
encoding)

# rename & extract
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
vals *= np.uint64(0xbf58476d1ce4e5b9)
vals ^= vals >> 27
vals *= np.uint64(0x94d049bb133111eb)
vals ^= vals >> 31
return vals
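The last five lines are the splitmix64-style finalizer: two 64-bit multiplications interleaved with xor-shifts, which spreads the (often small or sequential) integers produced above across the full uint64 range. The same step in isolation (illustrative sketch):

import numpy as np

def mix64(vals):
    # same xor-shift / multiply constants as the finalizer above
    vals = vals.astype('uint64', copy=True)
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals

print(mix64(np.arange(4, dtype='uint64')))   # consecutive ints -> well-spread uint64 values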