Permalink
Browse files

Support vectorized specializations

  • Loading branch information...
1 parent d41d090 commit ebc88508d8935874a91e580d856f0010a099d72d @markflorisson committed Jul 18, 2012
Showing with 166 additions and 52 deletions.
  1. +128 −48 Cython/Compiler/Vector.py
  2. +6 −1 Cython/Utility/MemoryView_C.c
  3. +27 −2 Cython/Utility/Vector.c
  4. +1 −1 Cython/minivect
  5. +4 −0 tests/array_expressions/broadcasting.pyx
View
@@ -16,6 +16,8 @@
_debug = False
_context_debug = False
+cython_vector_size = "CYTHON_VECTOR_SIZE"
+
class TypeMapper(minitypes.TypeMapper):
def map_type(self, type, wrap=False):
if type.is_typedef:
@@ -116,7 +118,7 @@ def create_hybrid_code(codegen, old_minicode):
code.codegen = codegen.clone(codegen.context, code)
return code
-class CCodeGen(codegen.CCodeGen):
+class CCodeGen(codegen.VectorCodegen):
def __init__(self, context, codewriter):
super(CCodeGen, self).__init__(context, codewriter)
@@ -543,6 +545,7 @@ def analyse_types(self, env):
axes=axes)
self.target.analyse_types(env)
+ self.max_ndim = max(self.dst.type.ndim, self.target.type.ndim)
self.type = self.target.type
def align_with_lhs(self, code):
@@ -583,21 +586,109 @@ def align_with_lhs(self, code):
def result(self):
return self.target.result()
- def run_specializer(self, specializer):
+ def run_specializer(self, code, specializer, guard=None, counter=None):
+ """
+ Run a given minivect specializer and optionally surround the prototype
+ and implementation code with a preprocessor guard.
+ """
codes = self.context.run(self.function, [specializer])
- return iter(codes).next()
+ specializer, ast, codewriter, (proto, impl) = iter(codes).next()
+
+ if guard is not None:
+ proto = "%s\n%s\n#endif\n" % (guard, proto)
+ impl = "%s\n%s\n#endif\n" % (guard, impl)
+
+ if counter is None:
+ counter = self.get_function_counter(proto, impl)
+
+ self.put_specialization(code, specializer, ast, codewriter,
+ proto, impl, counter)
+ return specializer, ast, codewriter, (proto, impl), counter
+
+ def get_function_counter(self, proto, impl):
+ code_counter = self.code_counter
+ filename = getattr(self.pos[0], 'filename', None)
+ if filename is not None:
+ key = self.pos[0].filename, proto, impl
+ if key in self._code_cache:
+ code_counter = self._code_cache[key]
+ else:
+ self._code_cache[key] = code_counter
+ SpecializationCaller.code_counter += 1
+ else:
+ SpecializationCaller.code_counter += 1
+
+ return code_counter
- def put_specialization(self, code, specializer):
- self.put_specialized_call(code, *self.run_specializer(specializer))
+ def put_specialization(self, code, specializer, specialized_function,
+ codewriter, proto, impl, code_counter):
+ "Insert generated minivect code into the Cython module"
+ # print id(specialized_function), specialized_function.mangled_name
+ specialized_function.mangled_name = (
+ specialized_function.mangled_name % (code_counter,))
+ proto = proto % code_counter
+ impl = impl % ((code_counter,) * impl.count("%d"))
+
+ utility = Code.UtilityCode(proto=proto, impl=impl)
+ code.globalstate.use_utility_code(utility)
+
+ if _debug:
+ marker = '-' * 20
+ print marker, 'proto', marker
+ print proto
+ print marker, 'impl', marker
+ print impl
+
+ def put_specialization_and_call(self, code, specializer, guard=None,
+ counter=None, can_vectorize=True):
+ "Generate code, inject it into the Cython module and generate a call"
+ if specializer.vectorized_equivalents and can_vectorize:
+ self.put_vectorized_specializations(code, specializer)
+ else:
+ # Generate code
+ result = self.run_specializer(code, specializer, guard, counter)
+ # Generate call to the generated code
+ self.put_specialized_call(code, *result)
+
+ def put_vectorized_specializations(self, code, normal_specializer):
+ """
+ Generate several versions of the code, one for SSE*, one for AVX
+ and one unvectorized version.
+ """
+ sse_specializer, avx_specializer = (
+ normal_specializer.vectorized_equivalents)
+
+ can_vectorize = sse_specializer.can_vectorize(self.context,
+ self.function)
+
+ if can_vectorize:
+ guard = "#if %s == %%d" % cython_vector_size
+ sse_size = sse_specializer.vector_size
+ avx_size = avx_specializer.vector_size
+ sse_guard = guard % sse_size
+ avx_guard = guard % avx_size
+ normal_guard = "#if !(%s == %d || %s == %d)" % (
+ cython_vector_size, sse_size, cython_vector_size, avx_size)
+
+ _, _, _, _, counter = self.run_specializer(
+ code, normal_specializer, guard=normal_guard)
+ self.run_specializer(code, sse_specializer, guard=sse_guard,
+ counter=counter)
+ self.put_specialization_and_call(code, avx_specializer,
+ guard=avx_guard, counter=counter)
+ else:
+ self.put_specialization_and_call(code, normal_specializer,
+ can_vectorize=False)
def is_c_order_code(self):
return "%s & __PYX_ARRAY_C_ORDER" % self.array_layout.result()
- def put_ordered_specializations(self, code, c_specialization, f_specialization):
+ def put_ordered_specializations(self, code, c_specialization,
+ f_specialization):
code.putln("if (%s) {" % self.is_c_order_code())
- self.put_specialization(code, c_specialization)
+ self.put_specialization_and_call(code, c_specialization)
code.putln("} else {")
- self.put_specialization(code, f_specialization)
+ self.put_specialization_and_call(code, f_specialization)
code.putln("}")
def _put_contig_specialization(self, code, if_clause, contig, mixed_contig):
@@ -611,7 +702,10 @@ def _put_contig_specialization(self, code, if_clause, contig, mixed_contig):
self.array_layout.result(), not_broadcasting)
code.putln("%s (%s) {" % (if_clause, condition))
- self.put_specialization(code, specializers.ContigSpecializer)
+
+ self.put_specialization_and_call(code,
+ specializers.ContigSpecializer)
+
code.putln("}")
if_clause = "else if"
@@ -624,20 +718,26 @@ def _put_tiled_specialization(self, code, if_clause, mixed_contig):
(if_clause, self.array_layout.result()))
code.putln("/* Tiled specializations */")
- self.put_ordered_specializations(code, specializers.CTiledStridedSpecializer,
- specializers.FTiledStridedSpecializer)
+ self.put_ordered_specializations(code,
+ specializers.CTiledStridedSpecializer,
+ specializers.FTiledStridedSpecializer)
if not mixed_contig:
code.putln("}")
return if_clause
def _put_inner_contig_specializations(self, code, if_clause, mixed_contig):
- if not mixed_contig:
+ """
+ Insert the inner contiguous specialization, if we have more than one
+ dimension. When the rhs is 2D but the LHS is 1D, the actual access
+ pattern is 1D.
+ """
+ if (not mixed_contig and self.dst.type.ndim > 1 and
+ self.target.type.ndim > 1):
code.putln("%s (%s & __PYX_ARRAYS_ARE_INNER_CONTIG) {" %
(if_clause, self.array_layout.result()))
- self.put_ordered_specializations(
- code,
+ self.put_ordered_specializations(code,
specializers.StridedCInnerContigSpecializer,
specializers.StridedFortranInnerContigSpecializer)
code.putln("}")
@@ -686,39 +786,11 @@ def contig_condition(self, specializer):
return MemoryView.get_best_slice_order(self.target)
def put_specialized_call(self, code, specializer, specialized_function,
- codewriter, result_code):
- proto, impl = result_code
-
- code_counter = self.code_counter
- filename = getattr(self.pos[0], 'filename', None)
- if filename is not None:
- key = self.pos[0].filename, proto, impl
- if key in self._code_cache:
- code_counter = self._code_cache[key]
- else:
- self._code_cache[key] = code_counter
- SpecializationCaller.code_counter += 1
- else:
- SpecializationCaller.code_counter += 1
-
- specialized_function.mangled_name = (
- specialized_function.mangled_name % code_counter)
- proto = proto % code_counter
- impl = impl % ((code_counter,) * impl.count("%d"))
-
- function = self.function
- ndim = function.ndim
-
- utility = Code.UtilityCode(proto=proto, impl=impl)
- code.globalstate.use_utility_code(utility)
-
- if _debug:
- marker = '-' * 20
- print marker, 'proto', marker
- print proto
- print marker, 'impl', marker
- print impl
-
+ codewriter, result_code, code_counter=None):
+ """
+ Generate a call to a given specializer and specialized
+ minivect function.
+ """
# all function call arguments
offset = max(self.target.type.ndim - self.function.ndim, 0)
args = ["&%s.shape[%d]" % (self.result(), offset)]
@@ -1431,6 +1503,8 @@ def load_utilities(env):
env.use_utility_code(array_order_utility)
env.use_utility_code(restrict_utility)
env.use_utility_code(tile_size_utility)
+ env.use_utility_code(vector_size_utility)
+ env.use_utility_code(vector_header_utility)
def load_vector_utility(name, context, **kwargs):
return Code.TempitaUtilityCode.load(name, "Vector.c", context=context, **kwargs)
@@ -1440,7 +1514,7 @@ def load_vector_cy_utility(name, context, **kwargs):
name, "Vector.pyx", context=context,
prefix='__pyx_vector_', **kwargs)
-context = MemoryView.context
+context = dict(MemoryView.context, cython_vector_size=cython_vector_size)
broadcast_utility = MemoryView.load_memview_cy_utility("Broadcasting",
context=context)
@@ -1453,3 +1527,9 @@ def load_vector_cy_utility(name, context, **kwargs):
omp_size_utility = load_vector_utility("OpenMPAutoTune", context)
tile_size_utility = load_vector_cy_utility("GetTileSize", context,
requires=[omp_size_utility])
+vector_size_utility = load_vector_utility(
+ "VectorizedUtility", context=context,
+ proto_block='utility_code_proto_before_types')
+
+vector_header_utility = load_vector_utility("VectorHeaderUtility",
+ context=context)
@@ -653,7 +653,7 @@ __pyx_memviewslice_is_contig2(const {{memviewslice_name}} mvs,
for (i = 0; i < ndim; i++) {
index = start + step * i;
- if (mvs.suboffsets[index] >= 0 || mvs.strides[index] != itemsize)
+ if (mvs.strides[index] != itemsize)
return 0;
itemsize *= mvs.shape[index];
@@ -666,6 +666,11 @@ static int
__pyx_memviewslice_is_contig(const {{memviewslice_name}} mvs,
char order, int ndim)
{
+ int i;
+ for (i = 0; i < ndim; i++)
+ if (mvs.suboffsets[i] >= 0)
+ return 0;
+
return __pyx_memviewslice_is_contig2(mvs, order, ndim, mvs.memview->view.itemsize);
}
View
@@ -29,7 +29,7 @@ __pyx_get_arrays_ordering(const {{memviewslice_name}} **ops, const int *ndims,
for (i = 0; i < nops; i++) {
char order = __pyx_get_best_slice_order(*ops[i], ndims[i]);
- int contig = __pyx_memviewslice_is_contig2(*ops[i], 'C', ndims[i], itemsizes[i]);
+ int contig = __pyx_memviewslice_is_contig2(*ops[i], order, ndims[i], itemsizes[i]);
if (order == 'C') {
all_f_contig = 0;
@@ -51,12 +51,24 @@ __pyx_get_arrays_ordering(const {{memviewslice_name}} **ops, const int *ndims,
} else if (seen_c_ish && seen_f_ish) {
return __PYX_ARRAYS_ARE_MIXED_STRIDED | (seen_c_ish > seen_f_ish);
} else {
+ /*
+ Check whether the operands are strided or inner contiguous.
+ We check whether the stride in the first or last (F/C) dimension equals
+ the itemsize, and we verify that no operand is broadcasting in the
+ first or last (F/C) dimension (that they all have the same extent).
+ */
+ Py_ssize_t extent;
+
+ if (seen_c_ish) extent = ops[0]->shape[ndims[0] - 1];
+ else extent = ops[0]->shape[0];
+
for (i = 0; i < nops; i++) {
int dim = 0;
if (seen_c_ish)
dim = ndims[i] - 1;
- if (ops[i]->strides[dim] != itemsizes[i])
+ if (ops[i]->strides[dim] != itemsizes[i] ||
+ ops[i]->shape[dim] != extent)
return __PYX_ARRAYS_ARE_STRIDED | !!seen_c_ish;
}
}
@@ -77,6 +89,18 @@ __pyx_get_arrays_ordering(const {{memviewslice_name}} **ops, const int *ndims,
#endif
#endif
+////////// VectorizedUtility.proto //////////
+#ifndef {{cython_vector_size}}
+ #define {{cython_vector_size}} 0
+#endif
+
+////////// VectorHeaderUtility.proto ////////////
+#if {{cython_vector_size}} == 4
+#include <xmmintrin.h>
+#elif {{cython_vector_size}} == 8
+#include <smmintrin.h>
+#endif
+
////////// OpenMPAutoTune.proto /////////
static CYTHON_INLINE int __pyx_compiled_with_openmp(void);
static CYTHON_INLINE void __pyx_test_sequential(double *a, int upper_limit);
@@ -112,3 +136,4 @@ void __pyx_test_parallel(double *a, const int upper_limit)
a[i] = a[i] + a[i + 1];
}
}
+
@@ -71,15 +71,19 @@ def test_broadcasting_c_contig(fused_dtype_t[::1] m1, fused_dtype_t[:, ::1] m2):
def test_broadcasting_larger_rhs_ndim(double[:] m):
"""
>>> test_broadcasting_larger_rhs_ndim(np.arange(10, dtype=np.double))
+ array([ 18., 16., 14., 12., 10., 8., 6., 4., 2., 0.])
"""
m[:] = m[None, ::-1] + m[None, ::-1]
+ return np.asarray(m)
@testcase
def test_broadcasting_larger_rhs_ndim_contig(double[::1] m):
"""
>>> test_broadcasting_larger_rhs_ndim_contig(np.arange(10, dtype=np.double))
+ array([ 18., 16., 14., 12., 10., 8., 6., 4., 2., 0.])
"""
m[:] = m[None, ::-1] + m[None, ::-1]
+ return np.asarray(m)
@testcase
def test_broadcasting_runtime(double[:, :] m1, double[:, :] m2):

0 comments on commit ebc8850

Please sign in to comment.