ENH: add data hashing routines (#14729)
jreback committed Nov 28, 2016
1 parent c5f219a commit 06f26b5
Showing 5 changed files with 498 additions and 2 deletions.
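The new functionality lives in two of the files below: pandas/src/hash.pyx implements a SipHash-2-4 kernel, and pandas/tools/hashing.py exposes hash_pandas_object and hash_array on top of it. A minimal usage sketch (illustrative only, assuming the new module is importable as pandas.tools.hashing):

import pandas as pd
from pandas.tools.hashing import hash_pandas_object, hash_array

s = pd.Series(['a', 'b', 'c'])
df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})

hash_array(s.values)                  # 1-d uint64 ndarray, one hash per element
hash_pandas_object(s)                 # uint64 Series; the index is folded into the hash
hash_pandas_object(df, index=False)   # per-row hashes of the frame, ignoring the index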
33 changes: 33 additions & 0 deletions asv_bench/benchmarks/algorithms.py
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from pandas.util import testing as tm


class algorithm(object):
@@ -55,3 +56,35 @@ def time_add_overflow_neg_arr(self):

def time_add_overflow_mixed_arr(self):
self.checked_add(self.arr, self.arrmixed)


class hashing(object):
goal_time = 0.2

def setup(self):
N = 100000

self.df = pd.DataFrame(
{'A': pd.Series(tm.makeStringIndex(100).take(
np.random.randint(0, 100, size=N))),
'B': pd.Series(tm.makeStringIndex(10000).take(
np.random.randint(0, 10000, size=N))),
'D': np.random.randn(N),
'E': np.arange(N),
'F': pd.date_range('20110101', freq='s', periods=N),
'G': pd.timedelta_range('1 day', freq='s', periods=N),
})
self.df['C'] = self.df['B'].astype('category')
self.df.iloc[10:20] = np.nan

def time_frame(self):
self.df.hash()

def time_series_int(self):
self.df.E.hash()

def time_series_string(self):
self.df.B.hash()

def time_series_categorical(self):
self.df.C.hash()
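The .hash() accessor used in the benchmark is not defined in the files shown on this page; the same operations can be exercised directly against the module-level function added in pandas/tools/hashing.py. A rough sketch with timeit (illustrative only, not part of the changeset):

import timeit

import numpy as np
import pandas as pd
from pandas.util import testing as tm
from pandas.tools.hashing import hash_pandas_object

N = 100000
df = pd.DataFrame({
    'B': pd.Series(tm.makeStringIndex(10000).take(
        np.random.randint(0, 10000, size=N))),
    'E': np.arange(N),
})
df['C'] = df['B'].astype('category')

for col in ['E', 'B', 'C']:        # int, string, categorical
    t = timeit.timeit(lambda: hash_pandas_object(df[col]), number=10)
    print(col, t / 10)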
180 changes: 180 additions & 0 deletions pandas/src/hash.pyx
@@ -0,0 +1,180 @@
# cython: profile=False
# Translated from the reference implementation
# at https://github.com/veorq/SipHash

import cython
cimport numpy as cnp
import numpy as np
from numpy cimport ndarray, uint8_t, uint32_t, uint64_t

from cpython cimport (PyString_Check,
PyBytes_Check,
PyUnicode_Check)
from libc.stdlib cimport malloc, free

DEF cROUNDS = 2
DEF dROUNDS = 4


@cython.boundscheck(False)
def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
"""
Parameters
----------
arr : 1-d object ndarray of objects
key : hash key, must be 16 bytes long once encoded
encoding : encoding for key & arr, defaults to 'utf8'

Returns
-------
1-d uint64 ndarray of hashes
"""
cdef:
Py_ssize_t i, l, n
ndarray[uint64_t] result
bytes data, k
uint8_t *kb, *lens
char **vecs, *cdata
object val

k = <bytes>key.encode(encoding)
kb = <uint8_t *>k
if len(k) != 16:
raise ValueError(
'key should be a 16-byte string encoded, got {!r} (len {})'.format(
k, len(k)))

n = len(arr)

# create an array of bytes
vecs = <char **> malloc(n * sizeof(char *))
lens = <uint8_t*> malloc(n * sizeof(uint8_t))

cdef list datas = []
for i in range(n):
val = arr[i]
if PyString_Check(val):
data = <bytes>val.encode(encoding)
elif PyBytes_Check(val):
data = <bytes>val
elif PyUnicode_Check(val):
data = <bytes>val.encode(encoding)
else:
# non-strings
data = <bytes>str(val).encode(encoding)

l = len(data)
lens[i] = l
cdata = data

# keep the reference alive through the end of the function
datas.append(data)
vecs[i] = cdata

result = np.empty(n, dtype=np.uint64)
with nogil:
for i in range(n):
result[i] = low_level_siphash(<uint8_t *>vecs[i], lens[i], kb)

free(vecs)
free(lens)
return result
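hash_object_array encodes every element to bytes (strings/unicode through the given encoding, everything else through str()) and runs each byte string through the SipHash kernel with the 16-byte key. A rough pure-Python equivalent, assuming the built extension is importable as pandas._hash (as pandas/tools/hashing.py below does):

import numpy as np
from pandas import _hash   # compiled from pandas/src/hash.pyx

def hash_object_array_py(arr, key, encoding='utf8'):
    kb = key.encode(encoding)                  # must be exactly 16 bytes once encoded
    out = np.empty(len(arr), dtype=np.uint64)
    for i, val in enumerate(arr):
        if isinstance(val, bytes):
            data = val                         # already bytes: hash as-is
        else:
            data = str(val).encode(encoding)   # str / unicode / everything else
        out[i] = _hash.siphash(data, kb)
    return out

arr = np.array(['a', 'b', 3], dtype=object)
# should agree with the Cython fast path:
# np.testing.assert_array_equal(hash_object_array_py(arr, '0123456789123456'),
#                               _hash.hash_object_array(arr, '0123456789123456'))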

cdef inline uint64_t _rotl(uint64_t x, uint64_t b) nogil:
return (x << b) | (x >> (64 - b))

cdef inline void u32to8_le(uint8_t* p, uint32_t v) nogil:
p[0] = <uint8_t>(v)
p[1] = <uint8_t>(v >> 8)
p[2] = <uint8_t>(v >> 16)
p[3] = <uint8_t>(v >> 24)

cdef inline void u64to8_le(uint8_t* p, uint64_t v) nogil:
u32to8_le(p, <uint32_t>v)
u32to8_le(p + 4, <uint32_t>(v >> 32))

cdef inline uint64_t u8to64_le(uint8_t* p) nogil:
return (<uint64_t>p[0] |
<uint64_t>p[1] << 8 |
<uint64_t>p[2] << 16 |
<uint64_t>p[3] << 24 |
<uint64_t>p[4] << 32 |
<uint64_t>p[5] << 40 |
<uint64_t>p[6] << 48 |
<uint64_t>p[7] << 56)
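These helpers are nothing more than a 64-bit rotation and little-endian byte packing; u8to64_le performs the same read as struct.unpack('<Q', ...). A small sanity-check sketch (Python 3, illustrative only):

import struct

MASK64 = (1 << 64) - 1

def rotl64(x, b):
    # Python-level _rotl; the mask is needed because Python ints are unbounded
    return ((x << b) | (x >> (64 - b))) & MASK64

# u8to64_le reads 8 bytes as a little-endian uint64
buf = bytes(range(8))
assert struct.unpack('<Q', buf)[0] == sum(byte << (8 * i) for i, byte in enumerate(buf))

# u64to8_le / u32to8_le write a value back out least-significant byte first
assert struct.pack('<Q', 0x0807060504030201) == bytes([1, 2, 3, 4, 5, 6, 7, 8])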

cdef inline void _sipround(uint64_t* v0, uint64_t* v1,
uint64_t* v2, uint64_t* v3) nogil:
v0[0] += v1[0]
v1[0] = _rotl(v1[0], 13)
v1[0] ^= v0[0]
v0[0] = _rotl(v0[0], 32)
v2[0] += v3[0]
v3[0] = _rotl(v3[0], 16)
v3[0] ^= v2[0]
v0[0] += v3[0]
v3[0] = _rotl(v3[0], 21)
v3[0] ^= v0[0]
v2[0] += v1[0]
v1[0] = _rotl(v1[0], 17)
v1[0] ^= v2[0]
v2[0] = _rotl(v2[0], 32)

cpdef uint64_t siphash(bytes data, bytes key) except? 0:
if len(key) != 16:
raise ValueError(
'key should be a 16-byte bytestring, got {!r} (len {})'.format(
key, len(key)))
return low_level_siphash(data, len(data), key)


@cython.cdivision(True)
cdef uint64_t low_level_siphash(uint8_t* data, size_t datalen,
uint8_t* key) nogil:
cdef uint64_t v0 = 0x736f6d6570736575ULL
cdef uint64_t v1 = 0x646f72616e646f6dULL
cdef uint64_t v2 = 0x6c7967656e657261ULL
cdef uint64_t v3 = 0x7465646279746573ULL
cdef uint64_t b
cdef uint64_t k0 = u8to64_le(key)
cdef uint64_t k1 = u8to64_le(key + 8)
cdef uint64_t m
cdef int i
cdef uint8_t* end = data + datalen - (datalen % sizeof(uint64_t))
cdef int left = datalen & 7
cdef int left_byte

b = (<uint64_t>datalen) << 56
v3 ^= k1
v2 ^= k0
v1 ^= k1
v0 ^= k0

while (data != end):
m = u8to64_le(data)
v3 ^= m
for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)
v0 ^= m

data += sizeof(uint64_t)

for i in range(left-1, -1, -1):
b |= (<uint64_t>data[i]) << (i * 8)

v3 ^= b

for i in range(cROUNDS):
_sipround(&v0, &v1, &v2, &v3)

v0 ^= b
v2 ^= 0xff

for i in range(dROUNDS):
_sipround(&v0, &v1, &v2, &v3)

b = v0 ^ v1 ^ v2 ^ v3

return b
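low_level_siphash follows the reference SipHash-2-4 layout: each 8-byte little-endian word is mixed in with cROUNDS (2) SipRounds, the message length and any trailing bytes are folded into a final word, and dROUNDS (4) finalization rounds produce the 64-bit digest. A pure-Python reference sketch (Python 3, illustrative only; it can be cross-checked against the compiled siphash() once the extension is built):

MASK64 = (1 << 64) - 1

def _rotl(x, b):
    return ((x << b) | (x >> (64 - b))) & MASK64

def _sipround(v0, v1, v2, v3):
    v0 = (v0 + v1) & MASK64; v1 = _rotl(v1, 13); v1 ^= v0; v0 = _rotl(v0, 32)
    v2 = (v2 + v3) & MASK64; v3 = _rotl(v3, 16); v3 ^= v2
    v0 = (v0 + v3) & MASK64; v3 = _rotl(v3, 21); v3 ^= v0
    v2 = (v2 + v1) & MASK64; v1 = _rotl(v1, 17); v1 ^= v2; v2 = _rotl(v2, 32)
    return v0, v1, v2, v3

def siphash24(data, key):
    if len(key) != 16:
        raise ValueError('key should be a 16-byte bytestring')
    k0 = int.from_bytes(key[:8], 'little')
    k1 = int.from_bytes(key[8:], 'little')
    v0 = 0x736f6d6570736575 ^ k0
    v1 = 0x646f72616e646f6d ^ k1
    v2 = 0x6c7967656e657261 ^ k0
    v3 = 0x7465646279746573 ^ k1
    b = (len(data) & 0xff) << 56
    nblocks = len(data) // 8
    for i in range(nblocks):                       # full 8-byte words
        m = int.from_bytes(data[8 * i:8 * i + 8], 'little')
        v3 ^= m
        for _ in range(2):                         # cROUNDS
            v0, v1, v2, v3 = _sipround(v0, v1, v2, v3)
        v0 ^= m
    for i, byte in enumerate(data[8 * nblocks:]):  # 0-7 trailing bytes
        b |= byte << (8 * i)
    v3 ^= b
    for _ in range(2):                             # cROUNDS
        v0, v1, v2, v3 = _sipround(v0, v1, v2, v3)
    v0 ^= b
    v2 ^= 0xff
    for _ in range(4):                             # dROUNDS
        v0, v1, v2, v3 = _sipround(v0, v1, v2, v3)
    return v0 ^ v1 ^ v2 ^ v3

# cross-check (assuming the built extension is importable):
# from pandas import _hash
# assert siphash24(b'pandas', b'0123456789123456') == _hash.siphash(b'pandas', b'0123456789123456')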
137 changes: 137 additions & 0 deletions pandas/tools/hashing.py
@@ -0,0 +1,137 @@
"""
data hash pandas / numpy objects
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas.lib import infer_dtype
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import is_categorical_dtype

# 16 byte long hashing key
_default_hash_key = '0123456789123456'


def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None):
"""
Return a data hash of the Index/Series/DataFrame

.. versionadded:: 0.19.2

Parameters
----------
index : boolean, default True
include the index in the hash (if Series/DataFrame)
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key

Returns
-------
Series of uint64, same length as the object
"""
if hash_key is None:
hash_key = _default_hash_key

def adder(h, hashed_to_add):
h = np.multiply(h, np.uint(3), h)
return np.add(h, hashed_to_add, h)

if isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
h = Series(h, index=obj, dtype='uint64')
elif isinstance(obj, ABCSeries):
h = hash_array(obj.values, encoding, hash_key).astype('uint64')
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)
h = Series(h, index=obj.index, dtype='uint64')
elif isinstance(obj, ABCDataFrame):
cols = obj.iteritems()
first_series = next(cols)[1]
h = hash_array(first_series.values, encoding,
hash_key).astype('uint64')
for _, col in cols:
h = adder(h, hash_array(col.values, encoding, hash_key))
if index:
h = adder(h, hash_pandas_object(obj.index,
index=False,
encoding=encoding,
hash_key=hash_key).values)

h = Series(h, index=obj.index, dtype='uint64')
else:
raise TypeError("Unexpected type for hashing %s" % type(obj))
return h
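For a DataFrame, the per-column hashes are folded together with the adder closure above: h starts as the first column's hash and every further column (and, optionally, the index hash) is combined as h = h * 3 + next with uint64 wrap-around. A sketch of that combination (illustrative only; it assumes the columns iterate in the order 'a', 'b'):

import numpy as np
import pandas as pd
from pandas.tools.hashing import hash_pandas_object, hash_array

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

h = hash_array(df['a'].values).astype('uint64')        # first column
h = h * np.uint64(3) + hash_array(df['b'].values)      # adder() for each remaining column
h = h * np.uint64(3) + hash_pandas_object(df.index,    # index folded in last
                                          index=False).values

# should match the combined frame hash:
# np.testing.assert_array_equal(h, hash_pandas_object(df).values)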


def hash_array(vals, encoding='utf8', hash_key=None):
"""
Given a 1d array, return an array of deterministic integers.

.. versionadded:: 0.19.2

Parameters
----------
vals : ndarray
encoding : string, default 'utf8'
encoding for data & key when strings
hash_key : string key to encode, default to _default_hash_key

Returns
-------
1d uint64 numpy array of hash values, same length as the vals
"""

# work with categoricals as ints. (This check is above the complex
# check so that we don't ask numpy if categorical is a subdtype of
# complex, as it will choke.)
if hash_key is None:
hash_key = _default_hash_key

if is_categorical_dtype(vals.dtype):
vals = vals.codes

# we'll be working with everything as 64-bit values, so handle this
# 128-bit value early
if np.issubdtype(vals.dtype, np.complex128):
return hash_array(vals.real) + 23 * hash_array(vals.imag)

# MAIN LOGIC:
inferred = infer_dtype(vals)

# First, turn whatever array this is into unsigned 64-bit ints, if we can
# manage it.
if inferred == 'boolean':
vals = vals.astype('u8')

if (np.issubdtype(vals.dtype, np.datetime64) or
np.issubdtype(vals.dtype, np.timedelta64) or
np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
else:

# it's MUCH faster to categorize object dtypes, then hash and rename
codes, categories = factorize(vals, sort=False)
categories = Index(categories)
c = Series(Categorical(codes, categories,
ordered=False, fastpath=True))
vals = _hash.hash_object_array(categories.values,
hash_key,
encoding)

# rename & extract
vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

# Then, redistribute these 64-bit ints within the space of 64-bit ints
vals ^= vals >> 30
vals *= np.uint64(0xbf58476d1ce4e5b9)
vals ^= vals >> 27
vals *= np.uint64(0x94d049bb133111eb)
vals ^= vals >> 31
return vals
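The last five lines are the splitmix64-style finalizer: two 64-bit multiplications interleaved with xor-shifts, which spreads the (often small or sequential) integers produced above across the full uint64 range. The same step in isolation (illustrative sketch):

import numpy as np

def mix64(vals):
    # same xor-shift / multiply constants as the finalizer above
    vals = vals.astype('uint64', copy=True)
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals

print(mix64(np.arange(4, dtype='uint64')))   # consecutive ints -> well-spread uint64 values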