Skip to content
Browse files

Auto-tune OpenMP size

  • Loading branch information...
1 parent 5c8409c commit e413b45483d451c261044cc5fb588643df2badbc @markflorisson committed
Showing with 149 additions and 20 deletions.
  1. +16 −6 Cython/Compiler/Vector.py
  2. +36 −0 Cython/Utility/Vector.c
  3. +96 −13 Cython/Utility/Vector.pyx
  4. +1 −1 Cython/minivect
View
22 Cython/Compiler/Vector.py
@@ -59,6 +59,8 @@ class CythonSpecializerMixin(object):
has_error_handler = False
def visit_FunctionNode(self, node):
+ b = self.astbuilder
+
def qualify(type):
type = type.qualify("const", "CYTHON_RESTRICT")
type.base_type = type.base_type.qualify("const")
@@ -75,12 +77,13 @@ def qualify(type):
if arg.strides_pointer:
arg.strides_pointer.type = qualify(arg.strides_pointer.type)
+ type = minitypes.Py_ssize_t.qualify("const")
if self.is_tiled_specializer:
- type = minitypes.Py_ssize_t.qualify("const")
- self._blocksize_var = self.astbuilder.variable(type, 'blocksize')
- node.scalar_arguments.append(
- self.astbuilder.funcarg(self._blocksize_var))
+ self._blocksize_var = b.variable(type, 'blocksize')
+ node.scalar_arguments.append(b.funcarg(self._blocksize_var))
+ node.omp_size = b.variable(type, 'omp_size')
+ node.scalar_arguments.append(b.funcarg(node.omp_size))
node = super(CythonSpecializerMixin, self).visit_FunctionNode(node)
return node
@@ -735,9 +738,14 @@ def put_specialized_call(self, code, specializer, specialized_function,
args.append(result)
args.extend(scalar_arg.result() for scalar_arg in self.scalar_operands)
+
+ n_operands = len(self.operands)
if specializer.is_tiled_specializer:
dtype_decl = self.type.dtype.declaration_code("")
- args.append("__pyx_vector_get_tile_size() / sizeof(%s)" % dtype_decl)
+ args.append("__pyx_vector_get_tile_size(sizeof(%s), %d)" % (
+ dtype_decl, n_operands))
+ args.append("__pyx_vector_get_omp_size(%d)" % n_operands)
+
call = "%s(%s)" % (specialized_function.mangled_name, ", ".join(args))
if self.may_error:
@@ -1418,4 +1426,6 @@ def load_vector_cy_utility(name, context, **kwargs):
array_order_utility = load_vector_utility("GetOrder", context)
restrict_utility = load_vector_utility(
"RestrictUtility", context, proto_block='utility_code_proto_before_types')
-tile_size_utility = load_vector_cy_utility("GetTileSize", context)
+omp_size_utility = load_vector_utility("OpenMPAutoTune", context)
+tile_size_utility = load_vector_cy_utility("GetTileSize", context,
+ requires=[omp_size_utility])
View
36 Cython/Utility/Vector.c
@@ -76,3 +76,39 @@ __pyx_get_arrays_ordering(const {{memviewslice_name}} **ops, const int *ndims,
#define CYTHON_RESTRICT
#endif
#endif
+
+////////// OpenMPAutoTune.proto /////////
+/* Forward declarations for the OpenMP auto-tuning helpers.  The parameter
+ * lists are spelled exactly like the definitions, const qualifier included. */
+static CYTHON_INLINE int __pyx_compiled_with_openmp(void);
+static CYTHON_INLINE void __pyx_test_sequential(double *a, const int upper_limit);
+static CYTHON_INLINE void __pyx_test_parallel(double *a, const int upper_limit);
+
+////////// OpenMPAutoTune /////////
+/* Report whether this code was built with OpenMP enabled: returns 1 when the
+ * compiler defines _OPENMP and 0 otherwise. */
+static CYTHON_INLINE
+int __pyx_compiled_with_openmp(void)
+{
+#if defined(_OPENMP)
+    const int openmp_enabled = 1;
+#else
+    const int openmp_enabled = 0;
+#endif
+    return openmp_enabled;
+}
+
+/* Sequential timing kernel: for every index below upper_limit - 1, add the
+ * following element of `a` into the current one, in place. */
+static CYTHON_INLINE
+void __pyx_test_sequential(double *a, const int upper_limit)
+{
+    const int last = upper_limit - 1;
+    int idx;
+
+    for (idx = 0; idx < last; ++idx) {
+        a[idx] += a[idx + 1];
+    }
+}
+/* Parallel timing kernel: the same in-place neighbour addition as the
+ * sequential kernel, with the loop shared between OpenMP threads when the
+ * file is compiled with OpenMP (otherwise it is a plain serial loop).
+ *
+ * NOTE(review): iteration i reads a[i + 1] while iteration i + 1 writes it,
+ * so threads working on adjacent chunks touch the same element without
+ * synchronization.  Confirm that callers only use the elapsed time and never
+ * rely on the resulting array contents. */
+static CYTHON_INLINE
+void __pyx_test_parallel(double *a, const int upper_limit)
+{
+    const int last = upper_limit - 1;
+    int idx;
+
+    #ifdef _OPENMP
+    #pragma omp parallel for
+    #endif
+    for (idx = 0; idx < last; ++idx) {
+        a[idx] += a[idx + 1];
+    }
+}
View
109 Cython/Utility/Vector.pyx
@@ -2,12 +2,12 @@
DEF _DEBUG = False
-DEF MIN_BLOCKSIZE = 256
+DEF MIN_BLOCKSIZE = 128
DEF SIZE = 1600
DEF MAX_TRIES = 4
DEF N_SAMPLES = 10
-import sys, types, time
+import sys, types, time, math
cdef extern from "stdlib.h":
void *malloc(size_t) nogil
@@ -17,21 +17,26 @@ cdef extern from "stdlib.h":
cdef bint have_tile_size = False
cdef Py_ssize_t tile_size
-cdef Py_ssize_t get_tile_size() except 0:
- if not have_tile_size:
- create_tile_size()
- return tile_size
-
-cdef int create_tile_size() except -1:
+cdef get_module():
modname = '__pyx_array_expressions'
mod = sys.modules.get(modname)
if not mod:
mod = types.ModuleType(modname)
- import time; t = time.time()
- mod.tile_size = compute_tile_size()
- if _DEBUG:
- print "total time:", time.time() - t, "tile size:", mod.tile_size
sys.modules[modname] = mod
+ return mod
+
+# Return the block size to pass to a tiled specialization.
+#
+# The module-level tile_size (computed on first use) is divided by the item
+# size and by half the number of operands, then rounded to the nearest power
+# of two.  The result is never smaller than MIN_BLOCKSIZE // itemsize and
+# never smaller than 1, because 0 is this function's error return value.
+cdef Py_ssize_t get_tile_size(Py_ssize_t itemsize, int n_operands) except 0:
+    if not have_tile_size:
+        create_tile_size()
+
+    result = tile_size / (itemsize * (float(n_operands) / 2))
+    # A budget below one element gives a negative exponent; clamp it so the
+    # power of two stays an integer of at least 1 instead of a fraction.
+    digits = max(int(round(math.log(result, 2))), 0)
+    return max(2 ** digits, MIN_BLOCKSIZE // itemsize, 1)
+
+cdef int create_tile_size() except -1:
+ mod = get_module()
+ if not hasattr(mod, 'tile_size'):
+ mod.tile_size = compute_tile_size()
# assigning to globals is broken in utility codes
(&have_tile_size)[0] = True
@@ -86,7 +91,7 @@ cdef try_blocksize(float *a, float *b, Py_ssize_t blocksize):
return time.time() - t
-cdef inline tile(float *a, float *b, Py_ssize_t blocksize):
+cdef void tile(float *a, float *b, Py_ssize_t blocksize):
cdef Py_ssize_t i, j, tiled_i, tiled_j, upper_i, upper_j
for tiled_i in range(0, SIZE, blocksize):
@@ -98,3 +103,81 @@ cdef inline tile(float *a, float *b, Py_ssize_t blocksize):
a[i * SIZE + j] += b[i + j * SIZE]
# print __pyx_get_tile_size()
+
+# cached minimum OpenMP size for the 'if' clause
+
+DEF MIN_OMP_SIZE = 512  # first (smallest) array size tried by compute_omp_size
+DEF N_OMP_SAMPLES = 10  # timed runs of each test kernel per candidate size
+DEF MAX_OMP_TRIES = 10  # number of candidate sizes tried, each double the last
+
+cdef extern from *:  # C helpers from the OpenMPAutoTune utility code
+ bint __pyx_compiled_with_openmp()
+ void __pyx_test_sequential(double *a, int upper_limit)
+ void __pyx_test_parallel(double *a, int upper_limit)
+
+
+cdef bint have_omp_size = False  # set once omp_size below has been filled in
+cdef Py_ssize_t omp_size  # tuned size; only meaningful when have_omp_size is set
+
+# Return the tuned minimum OpenMP size divided by the number of operands.
+#
+# The first call fills in the module-level omp_size through create_omp_size();
+# later calls reuse it.  With _DEBUG enabled the first call also prints how
+# long that step took.
+cdef Py_ssize_t get_omp_size(int n_operands) except -1:
+    if have_omp_size:
+        return omp_size / n_operands
+
+    if not _DEBUG:
+        create_omp_size()
+    else:
+        started = time.time()
+        create_omp_size()
+        elapsed = time.time() - started
+        print "OMP_SIZE", omp_size, "took", elapsed, "seconds"
+
+    return omp_size / n_operands
+
+# Fill in the module-level omp_size and mark it as available.
+#
+# The value is stored as the omp_size attribute of the object returned by
+# get_module(); compute_omp_size() only runs when that attribute is missing.
+# Returns 0 on success.
+cdef int create_omp_size() except -1:
+    registry = get_module()
+    already_tuned = hasattr(registry, 'omp_size')
+    if not already_tuned:
+        registry.omp_size = compute_omp_size()
+
+    # assigning to globals is broken in utility codes
+    (&have_omp_size)[0] = True
+    (&omp_size)[0] = registry.omp_size
+    return 0
+
+# Find the smallest array size at which the parallel test kernel is faster
+# than the sequential one.
+#
+# Candidate sizes start at MIN_OMP_SIZE and double after every try, for at
+# most MAX_OMP_TRIES tries.  For each candidate both kernels get one untimed
+# warm-up run followed by N_OMP_SAMPLES timed runs.
+#
+# Returns the first size whose parallel timing is lower, the size reached
+# after the final doubling if that never happens, or 1 when the code was not
+# compiled with OpenMP.  Raises MemoryError when the scratch buffer cannot be
+# allocated.
+cdef Py_ssize_t compute_omp_size() except 0:
+    cdef Py_ssize_t n_elements = MIN_OMP_SIZE * 2 ** MAX_OMP_TRIES
+    cdef Py_ssize_t best_size = MIN_OMP_SIZE
+    cdef Py_ssize_t k
+    cdef double *a
+    cdef int i, j
+
+    if not __pyx_compiled_with_openmp():
+        if _DEBUG:
+            print "Skipping omp tuning, not compiled with OpenMP"
+        return 1
+
+    a = <double *> malloc(sizeof(double) * n_elements)
+    if a == NULL:
+        raise MemoryError("unable to allocate the OpenMP tuning buffer")
+
+    # malloc'ed memory is indeterminate and the kernels read every element
+    # they are handed, so give the buffer defined contents first.
+    for k in range(n_elements):
+        a[k] = 0.0
+
+    try:
+        for i in range(MAX_OMP_TRIES):
+            # warm up
+            __pyx_test_sequential(a, best_size)
+            sequential_time = time.time()
+            for j in range(N_OMP_SAMPLES):
+                __pyx_test_sequential(a, best_size)
+            sequential_time = time.time() - sequential_time
+
+            # multi-core warm up & potential OpenMP initialization
+            __pyx_test_parallel(a, best_size)
+            parallel_time = time.time()
+            for j in range(N_OMP_SAMPLES):
+                __pyx_test_parallel(a, best_size)
+            parallel_time = time.time() - parallel_time
+
+            if _DEBUG:
+                print "size:", best_size, "time:", sequential_time, parallel_time
+
+            if parallel_time < sequential_time:
+                return best_size
+
+            best_size *= 2
+    finally:
+        # runs on the early return above as well as on errors
+        free(a)
+
+    return best_size
+
2 Cython/minivect
@@ -1 +1 @@
-Subproject commit 05fa4208d0213c7455bd0e9c053f4a51ef475e3f
+Subproject commit d706122424d7367f2702d9a9f297b3d61e5132c0

0 comments on commit e413b45

Please sign in to comment.
Something went wrong with that request. Please try again.