diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 4c81fae4ca927..fc81e86585384 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1627,3 +1627,305 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 store <256 x i8> %interleaved, <256 x i8>* %p
 ret void
 }
+
+define void @splat2_v4f64_load_store(<4 x double>* %s, <8 x double>* %d) {
+; AVX1-LABEL: splat2_v4f64_load_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
+; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm1, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat2_v4f64_load_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
+; AVX2-NEXT: vmovups %ymm1, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splat2_v4f64_load_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovups %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %x = load <4 x double>, <4 x double>* %s, align 8
+  %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %r = shufflevector <8 x double> %x2, <8 x double> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x double> %r, <8 x double>* %d, align 8
+  ret void
+}
+
+define void @splat2_v4i64_load_store(<4 x i64>* %s, <8 x i64>* %d) {
+; AVX1-LABEL: splat2_v4i64_load_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
+; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm1, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat2_v4i64_load_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
+; AVX2-NEXT: vmovups %ymm1, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splat2_v4i64_load_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovups %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %x = load <4 x i64>, <4 x i64>* %s, align 8
+  %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %r = shufflevector <8 x i64> %x2, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i64> %r, <8 x i64>* %d, align 8
+  ret void
+}
+
+define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) {
+; AVX1-LABEL: splat4_v8f32_load_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %xmm0
+; AVX1-NEXT: vmovups 16(%rdi), %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vmovups %xmm0, 48(%rsi)
+; AVX1-NEXT: vmovups %xmm7, 32(%rsi)
+; AVX1-NEXT: vmovups %xmm6, 16(%rsi)
+; AVX1-NEXT: vmovups %xmm5, (%rsi)
+; AVX1-NEXT: vmovups %xmm1, 112(%rsi)
+; AVX1-NEXT: vmovups %xmm4, 96(%rsi)
+; AVX1-NEXT: vmovups %xmm3, 80(%rsi)
+; AVX1-NEXT: vmovups %xmm2, 64(%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat4_v8f32_load_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovups (%rdi), %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
+; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
+; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
+; AVX2-NEXT: vmovups %ymm2, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splat4_v8f32_load_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
+; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
+; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
+; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %x = load <8 x float>, <8 x float>* %s, align 4
+  %x2 = shufflevector <8 x float> %x, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %x4 = shufflevector <16 x float> %x2, <16 x float> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r = shufflevector <32 x float> %x4, <32 x float> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x float> %r, <32 x float>* %d, align 4
+  ret void
+}
+
+define void @splat4_v8i32_load_store(<8 x i32>* %s, <32 x i32>* %d) {
+; AVX1-LABEL: splat4_v8i32_load_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %xmm0
+; AVX1-NEXT: vmovups 16(%rdi), %xmm1
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-NEXT: vmovups %xmm0, 48(%rsi)
+; AVX1-NEXT: vmovups %xmm7, 32(%rsi)
+; AVX1-NEXT: vmovups %xmm6, 16(%rsi)
+; AVX1-NEXT: vmovups %xmm5, (%rsi)
+; AVX1-NEXT: vmovups %xmm1, 112(%rsi)
+; AVX1-NEXT: vmovups %xmm4, 96(%rsi)
+; AVX1-NEXT: vmovups %xmm3, 80(%rsi)
+; AVX1-NEXT: vmovups %xmm2, 64(%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat4_v8i32_load_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovups (%rdi), %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
+; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
+; AVX2-NEXT: vmovups %ymm1, 32(%rsi)
+; AVX2-NEXT: vmovups %ymm2, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splat4_v8i32_load_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,0,8,1,9,1,9,2,10,2,10,3,11,3,11]
+; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,4,12,5,13,5,13,6,14,6,14,7,15,7,15]
+; AVX512-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, 64(%rsi)
+; AVX512-NEXT: vmovdqu64 %zmm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %x = load <8 x i32>, <8 x i32>* %s, align 4
+  %x2 = shufflevector <8 x i32> %x, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %x4 = shufflevector <16 x i32> %x2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %r = shufflevector <32 x i32> %x4, <32 x i32> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %r, <32 x i32>* %d, align 4
+  ret void
+}
+
+define void @splat4_v4f64_load_store(<4 x double>* %s, <16 x double>* %d) {
+; AVX1-LABEL: splat4_v4f64_load_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm1[0,0,2,2]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm3 = ymm0[0,0,2,2]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,3,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vmovupd %ymm0, 96(%rsi)
+; AVX1-NEXT: vmovupd %ymm3, 64(%rsi)
+; AVX1-NEXT: vmovupd %ymm1, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm2, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat4_v4f64_load_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[1,1,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
+; AVX2-NEXT: vmovups %ymm2, 64(%rsi)
+; AVX2-NEXT: vmovups %ymm3, 32(%rsi)
+; AVX2-NEXT: vmovups %ymm1, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splat4_v4f64_load_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[1,1,1,1]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovups %zmm0, 64(%rsi)
+; AVX512-NEXT: vmovups %zmm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %x = load <4 x double>, <4 x double>* %s, align 8
+  %x2 = shufflevector <4 x double> %x, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %x4 = shufflevector <8 x double> %x2, <8 x double> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %r = shufflevector <16 x double> %x4, <16 x double> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x double> %r, <16 x double>* %d, align 8
+  ret void
+}
+
+define void @splat4_v4i64_load_store(<4 x i64>* %s, <16 x i64>* %d) {
+; AVX1-LABEL: splat4_v4i64_load_store:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1,0,1]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm1[0,0,2,2]
+; AVX1-NEXT: vmovddup {{.*#+}} ymm3 = ymm0[0,0,2,2]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,3,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; AVX1-NEXT: vmovupd %ymm0, 96(%rsi)
+; AVX1-NEXT: vmovupd %ymm3, 64(%rsi)
+; AVX1-NEXT: vmovupd %ymm1, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm2, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat4_v4i64_load_store:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[1,1,1,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
+; AVX2-NEXT: vmovups %ymm2, 64(%rsi)
+; AVX2-NEXT: vmovups %ymm3, 32(%rsi)
+; AVX2-NEXT: vmovups %ymm1, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: splat4_v4i64_load_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %ymm0
+; AVX512-NEXT: vbroadcastsd (%rdi), %ymm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,2,2,2]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[1,1,1,1]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovups %zmm0, 64(%rsi)
+; AVX512-NEXT: vmovups %zmm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+  %x = load <4 x i64>, <4 x i64>* %s, align 8
+  %x2 = shufflevector <4 x i64> %x, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %x4 = shufflevector <8 x i64> %x2, <8 x i64> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %r = shufflevector <16 x i64> %x4, <16 x i64> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i64> %r, <16 x i64>* %d, align 8
+  ret void
+}