Skip to content

SLP vectorization causes 1.75x slowdown with march=skylake-avx512 due to vgatherdps #70259

Open
@abadams

Description

@abadams

I think the SLP cost model might be wrong for vector gathers on skylake.

Consider the following code which repeatedly permutes an array:

// Indexed copy: for every element the loop touches, dst[k] = src[idx[k]].
//
// Each iteration handles eight consecutive elements (i .. i + 7) as eight
// separate straight-line statements, and i advances by 8. The loop bound is
// only checked against i, so when n is not a multiple of 8 the final
// iteration reads idx[] and writes dst[] at positions >= n; callers must pass
// a multiple of 8 (the call sites in main pass 16 * 1024).
//
// src, idx and dst are all declared __restrict__, i.e. the caller promises
// they do not alias one another.
void f(const float *__restrict__ src, const int *__restrict__ idx, float *__restrict__ dst, int n) {
    for (int i = 0; i < n; i += 8) {
        // Eight independent load-through-index / store pairs on adjacent
        // dst and idx slots.
        dst[i] = src[idx[i]];
        dst[i + 1] = src[idx[i + 1]];
        dst[i + 2] = src[idx[i + 2]];
        dst[i + 3] = src[idx[i + 3]];
        dst[i + 4] = src[idx[i + 4]];
        dst[i + 5] = src[idx[i + 5]];
        dst[i + 6] = src[idx[i + 6]];
        dst[i + 7] = src[idx[i + 7]];
    }
}

// Benchmark driver: builds an index table, then calls f 200000 times,
// alternating the roles of src and dst so each call consumes the previous
// call's output. argc and argv are unused.
int main(int argc, char **argv) {
    const int n = 16 * 1024;
    // All three working arrays live on the stack (n elements each).
    float src[n], dst[n];
    int idx[n];
    for (int i = 0; i < n; i++) {
        // Some arbitrary permutation
        idx[i] = (i * 17 + 37) % n;
        src[i] = dst[i] = 0;
    }
    // A single non-zero element, so the final contents depend on where the
    // repeated indexing moves it.
    src[17] = 17.0f;

    for (int i = 0; i < 100000; i++) {
        // Ping-pong: src -> dst, then dst -> src, using the same index table.
        f(&src[0], &idx[0], &dst[0], n);
        f(&dst[0], &idx[0], &src[0], n);
    }

    // Introduce a dependence on the output
    return dst[0] == 17.0f ? 1 : 0;
}

Compiled with top-of-tree clang using -march=skylake -O3, it takes about 4.2 seconds to run on my i9-9960X. Compiled with -march=skylake -O3 -fno-slp-vectorize, it takes 2.4 seconds.

The only salient difference in the assembly is that SLP vectorization has packed the eight loads in f into a single gather instruction (vgatherdps). Here's the inner loop assembly with SLP vectorization on (with unrolling off for brevity):

.LBB1_4:                                # %for.body.i
                                        #   Parent Loop BB1_3 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	vmovups	32(%rsp,%rcx,4), %ymm0
	vpxor	%xmm1, %xmm1, %xmm1
	vpcmpeqd	%ymm2, %ymm2, %ymm2
	vgatherdps	%ymm2, (%r14,%ymm0,4), %ymm1
	vmovups	%ymm1, 65568(%rsp,%rcx,4)
	addq	$8, %rcx
	cmpq	$16376, %rcx                    # imm = 0x3FF8
	jb	.LBB1_4

and here it is with slp vectorization off:

.LBB1_4:                                # %for.body.i
                                        #   Parent Loop BB1_3 Depth=1
                                        # =>  This Inner Loop Header: Depth=2
	movslq	131104(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 32(%rsp,%rcx,4)
	movslq	131108(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 36(%rsp,%rcx,4)
	movslq	131112(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 40(%rsp,%rcx,4)
	movslq	131116(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 44(%rsp,%rcx,4)
	movslq	131120(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 48(%rsp,%rcx,4)
	movslq	131124(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 52(%rsp,%rcx,4)
	movslq	131128(%rsp,%rcx,4), %rdx
	vmovss	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovss	%xmm0, 56(%rsp,%rcx,4)
	movslq	131132(%rsp,%rcx,4), %rdx
	vmovd	65536(%rsp,%rdx,4), %xmm0       # xmm0 = mem[0],zero,zero,zero
	vmovd	%xmm0, 60(%rsp,%rcx,4)
	addq	$8, %rcx
	cmpq	$16376, %rcx                    # imm = 0x3FF8
	jb	.LBB1_4

Interestingly, llvm-mca has the right idea. It says the version with SLP vectorization on is 2310 cycles per 100 iterations, and the version with it off is 813 cycles per 100 iterations.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions