Merge pull request #3703 from stuartarchibald/wip/hash_impls

Implementations of type hashing.
numba · Feb 14, 2019 · 1f8292d · 1f8292d
2 parents 3f6f4a3 + 5ab801f
commit 1f8292d
Show file tree

Hide file tree

Showing 17 changed files with 1,068 additions and 161 deletions.
diff --git a/buildscripts/azure/azure-windows.yml b/buildscripts/azure/azure-windows.yml
@@ -53,6 +53,9 @@ jobs:
         # One of the tbb tests is failing on Azure.  Removing tbb from testing
         # until we can figure out why
         conda remove -y tbb tbb-devel
+        pushd bin
+        numba -s
+        popd
         python -m numba.tests.test_runtests
         python runtests.py -m 2 -b -- numba.tests
       displayName: 'Test'
diff --git a/docs/source/developer/hashing.rst b/docs/source/developer/hashing.rst
@@ -0,0 +1,57 @@
+
+================
+Notes on Hashing
+================
+
+Numba supports the built-in :func:`hash` and does so by simply calling the
+:func:`__hash__` member function on the supplied argument. This makes it
+trivial to add hash support for new types as all that is required is the
+application of the extension API :func:`overload_method` decorator to overload
+a function for computing the hash value for the new type registered to the
+type's :func:`__hash__` method. For example::
+
+    from numba.extending import overload_method
+
+    @overload_method(myType, '__hash__')
+    def myType_hash_overload(obj):
+        # implementation details
+
+
+The Implementation
+==================
+
+The implementation of the Numba hashing functions strictly follows that of
+Python 3. The only exception to this is that for hashing Unicode and bytes (for
+content longer than ``sys.hash_info.cutoff``) the only supported algorithm is
+``siphash24`` (default in CPython 3). As a result Numba will match Python 3
+hash values for all supported types under the default conditions described.
+Python 2 hashing support is set up to follow Python 3 and similar defaults are
+hard coded for this purpose, including, perhaps most noticeably,
+``sys.hash_info.cutoff`` being set to zero.
+
+Unicode hash cache differences
+------------------------------
+
+Both Numba and CPython Unicode string internal representations have a ``hash``
+member for the purposes of caching the string's hash value. This member is
+always checked ahead of computing a hash value with the view of simply providing
+a value from cache as it is considerably cheaper to do so. The Numba Unicode
+string hash caching implementation behaves in a similar way to that of
+CPython's. The only notable behavioral change (and its only impact is a minor
+potential change in performance) is that Numba always computes and caches the
+hash for Unicode strings created in ``nopython mode`` at the time they are boxed
+for reuse in Python. This is too eager in some cases in comparison to CPython,
+which may delay hashing a new Unicode string depending on creation method. It
+should also be noted that Numba copies in the ``hash`` member of the CPython
+internal representation for Unicode strings when unboxing them to its own
+representation so as to not recompute the hash of a string that already has a
+hash value associated with it.
+
+The accommodation of ``PYTHONHASHSEED``
+---------------------------------------
+
+The ``PYTHONHASHSEED`` environment variable can be used to seed the CPython
+hashing algorithms, e.g. for the purposes of reproducibility. The Numba hashing
+implementation directly reads the CPython hashing algorithms' internal state and
+as a result the influence of ``PYTHONHASHSEED`` is replicated in Numba's
+hashing implementations.
diff --git a/docs/source/developer/index.rst b/docs/source/developer/index.rst
@@ -18,4 +18,5 @@ Developer Manual
    stencil.rst
    custom_pipeline.rst
    environment.rst
+   hashing.rst
    roadmap.rst
diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst
@@ -338,6 +338,7 @@ The following built-in functions are supported:
 * :func:`divmod`
 * :func:`enumerate`
 * :class:`float`
+* :func:`hash` (see :ref:`pysupported-hashing` below)
 * :class:`int`: only the one-argument form
 * :func:`iter`: only the one-argument form
 * :func:`len`
@@ -353,6 +354,25 @@ The following built-in functions are supported:
   (e.g. numbers and named tuples)
 * :func:`zip`
 
+.. _pysupported-hashing:
+
+Hashing
+-------
+
+The :func:`hash` built-in is supported and produces hash values for all
+supported hashable types with the following Python version specific behavior:
+
+Under Python 3, hash values computed by Numba will exactly match those computed
+in CPython under the condition that the :attr:`sys.hash_info.algorithm` is
+``siphash24`` (default).
+
+Under Python 2, hash values computed by Numba will follow the behavior
+described for Python 3 with the :attr:`sys.hash_info.algorithm` emulated as
+``siphash24``. No attempt is made to replicate Python 2 hashing behavior.
+
+The ``PYTHONHASHSEED`` environment variable influences the hashing behavior in
+precisely the manner described in the CPython documentation.
+
 
 Standard library modules
 ========================

diff --git a/numba/_helperlib.c b/numba/_helperlib.c
@@ -5,6 +5,7 @@
  */
 
 #include "_pymodule.h"
+#include <stddef.h>
 #include <stdio.h>
 #include <math.h>
 #include "_math_c99.h"
@@ -1063,12 +1064,55 @@ numba_unpickle(const char *data, int n)
  * Unicode helpers
  */
 
+/* Developer note:
+ *
+ * The hash value of unicode objects is obtained via:
+ * ((PyASCIIObject *)(obj))->hash;
+ * The use comes from this definition:
+ * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Objects/unicodeobject.c#L119-L120
+ * and it's used extensively throughout the `cpython/Objects/unicodeobject.c`
+ * source, not least in `unicode_hash` itself:
+ * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Objects/unicodeobject.c#L11662-L11679
+ *
+ * The Unicode string struct layouts are described here:
+ * https://github.com/python/cpython/blob/6d43f6f081023b680d9db4542d19b9e382149f0a/Include/cpython/unicodeobject.h#L82-L161
+ * essentially, all the unicode string layouts start with a `PyASCIIObject` at
+ * offset 0 (as of commit 6d43f6f081023b680d9db4542d19b9e382149f0a, somewhere
+ * in the 3.8 development cycle).
+ *
+ * For safety against future CPython internal changes, the code checks that the
+ * _base members of the unicode structs are what is expected in 3.7, and that
+ * their offset is 0. It then walks the struct to the hash location to make sure
+ * the offset is indeed the same as PyASCIIObject->hash.
+ * Note: The large condition in the if should evaluate to a compile time
+ * constant.
+ */
+
+#define MEMBER_SIZE(structure, member) sizeof(((structure *)0)->member)
+
 NUMBA_EXPORT_FUNC(void *)
-numba_extract_unicode(PyObject *obj, Py_ssize_t *length, int *kind) {
+numba_extract_unicode(PyObject *obj, Py_ssize_t *length, int *kind,
+                      Py_ssize_t *hash) {
 #if (PY_MAJOR_VERSION >= 3) && (PY_MINOR_VERSION >= 3)
     if (!PyUnicode_READY(obj)) {
         *length = PyUnicode_GET_LENGTH(obj);
         *kind = PyUnicode_KIND(obj);
+        /* this is here as a crude check for safe casting of all unicode string
+         * structs to a PyASCIIObject */
+        if (MEMBER_SIZE(PyCompactUnicodeObject, _base) == sizeof(PyASCIIObject)             &&
+            MEMBER_SIZE(PyUnicodeObject, _base) == sizeof(PyCompactUnicodeObject)           &&
+            offsetof(PyCompactUnicodeObject, _base) == 0                                    &&
+            offsetof(PyUnicodeObject, _base) == 0                                           &&
+            offsetof(PyCompactUnicodeObject, _base.hash) == offsetof(PyASCIIObject, hash)   &&
+            offsetof(PyUnicodeObject, _base._base.hash) == offsetof(PyASCIIObject, hash)
+           ) {
+            /* Grab the hash from the type object cache, do not compute it. */
+            *hash = ((PyASCIIObject *)(obj))->hash;
+        }
+        else {
+            /* cast is not safe, fail */
+            return NULL;
+        }
         return PyUnicode_DATA(obj);
     } else {
         return NULL;

diff --git a/numba/dispatcher.py b/numba/dispatcher.py
@@ -643,7 +643,6 @@ def compile(self, sig):
             existing = self.overloads.get(tuple(args))
             if existing is not None:
                 return existing.entry_point
-
             # Try to load from disk cache
             cres = self._cache.load_overload(sig, self.targetctx)
             if cres is not None:

diff --git a/numba/pythonapi.py b/numba/pythonapi.py
@@ -1136,21 +1136,25 @@ def string_as_string_size_and_kind(self, strobj):
         The ``buffer`` is a i8* of the output buffer.
         The ``length`` is a i32/i64 (py_ssize_t) of the length of the buffer.
         The ``kind`` is a i32 (int32) of the Unicode kind constant
+        The ``hash`` is a long/uint64_t (py_hash_t) of the Unicode constant hash
         """
         if PYVERSION >= (3, 3):
             p_length = cgutils.alloca_once(self.builder, self.py_ssize_t)
             p_kind = cgutils.alloca_once(self.builder, Type.int())
+            p_hash = cgutils.alloca_once(self.builder, self.py_hash_t)
             fnty = Type.function(self.cstring, [self.pyobj,
                                                 self.py_ssize_t.as_pointer(),
-                                                Type.int().as_pointer()])
+                                                Type.int().as_pointer(),
+                                                self.py_hash_t.as_pointer()])
             fname = "numba_extract_unicode"
             fn = self._get_function(fnty, name=fname)
 
-            buffer = self.builder.call(fn, [strobj, p_length, p_kind])
+            buffer = self.builder.call(fn, [strobj, p_length, p_kind, p_hash])
             ok = self.builder.icmp_unsigned('!=',
                                             ir.Constant(buffer.type, None),
                                             buffer)
-            return (ok, buffer, self.builder.load(p_length), self.builder.load(p_kind))
+            return (ok, buffer, self.builder.load(p_length),
+                    self.builder.load(p_kind), self.builder.load(p_hash))
         else:
             assert False, 'not supported on Python < 3.3'
 
@@ -1188,6 +1192,12 @@ def bytes_from_string_and_size(self, string, size):
         fn = self._get_function(fnty, name=fname)
         return self.builder.call(fn, [string, size])
 
+    def object_hash(self, obj):
+        fnty = Type.function(self.py_hash_t, [self.pyobj,])
+        fname = "PyObject_Hash"
+        fn = self._get_function(fnty, name=fname)
+        return self.builder.call(fn, [obj,])
+
     def object_str(self, obj):
         fnty = Type.function(self.pyobj, [self.pyobj])
         fn = self._get_function(fnty, name="PyObject_Str")

diff --git a/numba/targets/base.py b/numba/targets/base.py
@@ -262,7 +262,7 @@ def refresh(self):
         # Populate built-in registry
         from . import (arraymath, enumimpl, iterators, linalg, numbers,
                        optional, polynomial, rangeobj, slicing, smartarray,
-                       tupleobj, gdb_hook)
+                       tupleobj, gdb_hook, hashing)
         try:
             from . import npdatetime
         except NotImplementedError:

diff --git a/numba/targets/boxing.py b/numba/targets/boxing.py
@@ -819,7 +819,7 @@ def _python_set_to_native(typ, obj, c, size, setptr, errorptr):
                 native = c.unbox(typ.dtype, itemobj)
                 with c.builder.if_then(native.is_error, likely=False):
                     c.builder.store(cgutils.true_bit, errorptr)
-                inst.add(native.value, do_resize=False)
+                inst.add_pyapi(c.pyapi, native.value, do_resize=False)
 
             if typ.reflected:
                 inst.parent = obj