2 changes: 2 additions & 0 deletions numpy/_core/src/common/simd/avx512/avx512.h
@@ -11,6 +11,8 @@
 // Enough limit to allow us to use _mm512_i32gather_* and _mm512_i32scatter_*
 #define NPY_SIMD_MAXLOAD_STRIDE32  (0x7fffffff / 16)
 #define NPY_SIMD_MAXSTORE_STRIDE32 (0x7fffffff / 16)
+#define NPY_SIMD_MAXLOAD_STRIDE64  (0x7fffffff / 16)
+#define NPY_SIMD_MAXSTORE_STRIDE64 (0x7fffffff / 16)

 typedef __m512i npyv_u8;
 typedef __m512i npyv_s8;
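The new 64-bit limits mirror the existing 32-bit ones: with `_mm512_i32gather_*`/`_mm512_i32scatter_*`, each index is a signed 32-bit element, so a stride only qualifies for SIMD if its largest index stays below `INT_MAX`. A minimal sketch of the kind of check these constants feed, assuming the shape of NumPy's `npyv_loadable_stride_f64` helper (the standalone name and the element-unit stride below are illustrative assumptions, not the library's exact API):

```c
#include <stdlib.h>   /* llabs */

/* Same constant as the header above: INT_MAX / 16 leaves enough headroom
 * that a signed 32-bit gather/scatter index cannot overflow for any lane. */
#define MAXLOAD_STRIDE64 (0x7fffffff / 16)

/* Illustrative stand-in for npyv_loadable_stride_f64: `stride` is taken
 * in elements here; nonzero means i32gather indices are safe. */
static inline int
loadable_stride_f64(long long stride)
{
    return llabs(stride) <= MAXLOAD_STRIDE64;
}
```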
28 changes: 0 additions & 28 deletions numpy/_core/src/umath/fast_loop_macros.h
@@ -323,34 +323,6 @@ abs_ptrdiff(char *a, char *b)
 ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
  ((abs_ptrdiff(args[1], args[0]) == 0))))

-/*
- * Avoid using SIMD for very large step sizes for several reasons:
- * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
- *    in which case we need two i64gather instructions and an additional vinsertf32x8
- *    instruction to load a single zmm register (since one i64gather instruction
- *    loads into a ymm register). This is not ideal for performance.
- * 2) Gather and scatter instructions can be slow when the loads/stores
- *    cross page boundaries.
- *
- * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
- * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE
- * ensures this. The condition also requires that the input and output arrays
- * should have no overlap in memory.
- */
-#define IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP \
-    ((labs(steps[0]) < MAX_STEP_SIZE) && \
-     (labs(steps[1]) < MAX_STEP_SIZE) && \
-     (labs(steps[2]) < MAX_STEP_SIZE) && \
-     (nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
-     (nomemoverlap(args[1], steps[1], args[2], steps[2], dimensions[0])))
-
-#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
-    ((labs(steps[0]) < MAX_STEP_SIZE) && \
-     (labs(steps[1]) < MAX_STEP_SIZE) && \
-     (labs(steps[2]) < MAX_STEP_SIZE) && \
-     (nomemoverlap(args[0], steps[0], args[2], steps[2], dimensions[0])) && \
-     (nomemoverlap(args[0], steps[0], args[1], steps[1], dimensions[0])))
-
 /*
  * 1) Output should be contiguous, can handle strided input data
  * 2) Input step should be smaller than MAX_STEP_SIZE for performance
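Both deleted macros bundled two separate conditions: every step bounded by `MAX_STEP_SIZE`, and pairwise `nomemoverlap` tests between inputs and outputs. The dispatch sites now state the same two conditions directly, via the `npyv_*_stride_*` helpers and `is_mem_overlap`. A sketch of an extent-intersection overlap test in that spirit, assuming byte strides and an explicit item size (the function name and exact arithmetic are illustrative, not NumPy's implementation):

```c
#include <stddef.h>

/* Illustrative overlap test: two strided streams of n items of
 * `itemsize` bytes overlap iff their byte extents intersect. */
static int
strided_overlap(const char *a, ptrdiff_t step_a,
                const char *b, ptrdiff_t step_b,
                ptrdiff_t n, size_t itemsize)
{
    if (n <= 0) {
        return 0;
    }
    /* lowest and one-past-highest byte touched by each stream */
    const char *a_lo = step_a >= 0 ? a : a + (n - 1) * step_a;
    const char *a_hi = (step_a >= 0 ? a + (n - 1) * step_a : a) + itemsize;
    const char *b_lo = step_b >= 0 ? b : b + (n - 1) * step_b;
    const char *b_hi = (step_b >= 0 ? b + (n - 1) * step_b : b) + itemsize;
    return a_lo < b_hi && b_lo < a_hi;
}
```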
13 changes: 11 additions & 2 deletions numpy/_core/src/umath/loops_exponent_log.dispatch.c.src
@@ -1350,12 +1350,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
  * #TYPE = FLOAT, DOUBLE#
  * #c = f, #
  * #C = F, #
+ * #suffix = f32, f64#
  */
 NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_frexp)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
 #ifdef SIMD_AVX512_SKX
-    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+    if ((npyv_loadable_stride_@suffix@(steps[0])) &&
+        (npyv_storable_stride_@suffix@(steps[1])) &&
+        (npyv_storable_stride_@suffix@(steps[2])) &&
+        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
+        (!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) {
         AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
         return;
     }
@@ -1370,7 +1375,11 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_ldexp)
 (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
 #ifdef SIMD_AVX512_SKX
-    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+    if ((npyv_loadable_stride_@suffix@(steps[0])) &&
+        (npyv_storable_stride_@suffix@(steps[1])) &&
+        (npyv_storable_stride_@suffix@(steps[2])) &&
+        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
+        (!is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))) {
         AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
         return;
     }
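For the DOUBLE/f64 template instance (the `.c.src` preprocessor substitutes `@TYPE@`/`@suffix@` pairs positionally: FLOAT/f32, DOUBLE/f64), the new frexp guard expands to roughly the fragment below. The per-step comments reflect frexp's one-input, two-output signature and are my reading, not annotations from the source:

```c
#ifdef SIMD_AVX512_SKX
    if ((npyv_loadable_stride_f64(steps[0])) &&   /* input stride */
        (npyv_storable_stride_f64(steps[1])) &&   /* first output (mantissa) */
        (npyv_storable_stride_f64(steps[2])) &&   /* second output (exponent) */
        (!is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0])) &&
        (!is_mem_overlap(args[0], steps[0], args[1], steps[1], dimensions[0]))) {
        AVX512_SKX_frexp_DOUBLE(args, dimensions, steps);
        return;
    }
#endif
```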