[X86] lowerV4I64Shuffle - try harder to lower to PERMQ(BLENDD(V1,V2)) pattern
RKSimon committed May 15, 2022
1 parent df5ea2b commit 32162cf
Showing 3 changed files with 126 additions and 103 deletions.
5 changes: 5 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -17725,6 +17725,11 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
          DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
    return V;

  // Try to lower to PERMQ(BLENDD(V1,V2)).
  if (SDValue V =
          lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
    return V;

  // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either inputs are already in place,
  // we will be able to shuffle even across lanes the other input in a single
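As context for the new call above, here is a minimal standalone sketch (not the LLVM helper itself; matchBlendThenPermute and BlendPermute are names invented for this illustration) of the decomposition a PERMQ(BLENDD(V1,V2)) lowering relies on: every requested element must be obtainable by first blending V1 and V2 element-by-element in place and then reordering the blended vector with a single-input permute, which fails only when two mask elements demand different sources for the same element position.

// Minimal sketch of matching a two-input v4i64 shuffle mask as
// blend-in-place followed by a single-input permute (PERMQ-style).
#include <array>
#include <cstdio>
#include <optional>

struct BlendPermute {
  std::array<int, 4> BlendFromV2; // per position: -1 undef, 0 take V1, 1 take V2
  std::array<int, 4> Permute;     // single-input permute applied after the blend
};

// Mask elements 0..3 select from V1, 4..7 from V2, -1 is undef.
std::optional<BlendPermute>
matchBlendThenPermute(const std::array<int, 4> &Mask) {
  BlendPermute R;
  R.BlendFromV2.fill(-1);
  R.Permute.fill(0);
  for (int I = 0; I < 4; ++I) {
    int M = Mask[I];
    if (M < 0)
      continue;      // undef output element: any source works
    int Src = M / 4; // 0 = V1, 1 = V2
    int Elt = M % 4; // position of the element inside that source
    // The in-place blend must take position Elt from one consistent source;
    // two mask elements wanting different sources at Elt is a conflict.
    if (R.BlendFromV2[Elt] >= 0 && R.BlendFromV2[Elt] != Src)
      return std::nullopt;
    R.BlendFromV2[Elt] = Src;
    R.Permute[I] = Elt; // the permute then moves it into output slot I
  }
  return R;
}

int main() {
  // Mask <2,7,4,1> (shuffle_v4i64_2741 below) decomposes; an unpack-style
  // mask such as <0,4,1,5> conflicts at position 0 and is rejected.
  if (auto R = matchBlendThenPermute({2, 7, 4, 1})) {
    std::printf("blend-from-V2: %d %d %d %d\n", R->BlendFromV2[0],
                R->BlendFromV2[1], R->BlendFromV2[2], R->BlendFromV2[3]);
    std::printf("permute:       %d %d %d %d\n", R->Permute[0], R->Permute[1],
                R->Permute[2], R->Permute[3]);
  }
  std::printf("<0,4,1,5> feasible: %d\n",
              matchBlendThenPermute({0, 4, 1, 5}).has_value());
  return 0;
}

Run on the mask <2,7,4,1> from shuffle_v4i64_2741, the sketch yields blend-from-V2 <1,0,0,1> and permute <2,3,0,1>, which is exactly the shape of the VBLENDPS + VPERMPD pair that replaces the old VPERM2F128-based sequences in the updated test checks below.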
168 changes: 84 additions & 84 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -121,22 +121,22 @@ define void @load_i64_stride3_vf4(<12 x i64>* %in.vec, <4 x i64>* %out.vec0, <4
;
; AVX2-LABEL: load_i64_stride3_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rdi), %ymm1
; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vmovdqa %ymm2, (%rsi)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm2, (%rsi)
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -251,26 +251,26 @@ define void @load_i64_stride3_vf8(<24 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
;
; AVX2-LABEL: load_i64_stride3_vf8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm5
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 176(%rdi), %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-NEXT: vmovaps (%rdi), %ymm1
; AVX2-NEXT: vmovaps 128(%rdi), %ymm2
; AVX2-NEXT: vmovaps 96(%rdi), %ymm3
; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4
; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5],ymm5[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm5
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vmovaps 16(%rdi), %xmm2
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = mem[0,1,0,3]
@@ -279,10 +279,10 @@ define void @load_i64_stride3_vf8(<24 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovdqa %ymm5, (%rsi)
; AVX2-NEXT: vmovdqa %ymm4, 32(%rsi)
; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm5, (%rsi)
; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX2-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-NEXT: vzeroupper
@@ -497,46 +497,46 @@ define void @load_i64_stride3_vf16(<48 x i64>* %in.vec, <16 x i64>* %out.vec0, <
;
; AVX2-LABEL: load_i64_stride3_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm6
; AVX2-NEXT: vmovdqa 320(%rdi), %ymm5
; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8
; AVX2-NEXT: vmovdqa (%rdi), %ymm9
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm10
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm11
; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vinserti128 $1, 352(%rdi), %ymm0, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm6[0,3,2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 272(%rdi), %ymm6
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 176(%rdi), %ymm10
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 80(%rdi), %ymm9
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm7[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vpbroadcastq 368(%rdi), %ymm7
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: vmovaps 224(%rdi), %ymm3
; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
; AVX2-NEXT: vmovaps 320(%rdi), %ymm6
; AVX2-NEXT: vmovaps 288(%rdi), %ymm7
; AVX2-NEXT: vmovaps 32(%rdi), %ymm8
; AVX2-NEXT: vmovaps (%rdi), %ymm9
; AVX2-NEXT: vmovaps 128(%rdi), %ymm10
; AVX2-NEXT: vmovaps 96(%rdi), %ymm11
; AVX2-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm9[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm0, %ymm2
; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm7[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vinsertf128 $1, 256(%rdi), %ymm0, %ymm4
; AVX2-NEXT: vpermpd {{.*#+}} ymm12 = ymm5[0,3,2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm3[4,5],ymm12[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 272(%rdi), %ymm5
; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 176(%rdi), %ymm10
; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 80(%rdi), %ymm9
; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vbroadcastsd 368(%rdi), %ymm7
; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-NEXT: vmovaps 112(%rdi), %xmm7
; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
@@ -553,14 +553,14 @@ define void @load_i64_stride3_vf16(<48 x i64>* %in.vec, <16 x i64>* %out.vec0, <
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = mem[0,1,0,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
; AVX2-NEXT: vmovdqa %ymm4, 64(%rsi)
; AVX2-NEXT: vmovdqa %ymm15, 96(%rsi)
; AVX2-NEXT: vmovdqa %ymm14, (%rsi)
; AVX2-NEXT: vmovdqa %ymm13, 32(%rsi)
; AVX2-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-NEXT: vmovdqa %ymm8, (%rdx)
; AVX2-NEXT: vmovdqa %ymm6, 32(%rdx)
; AVX2-NEXT: vmovdqa %ymm3, 64(%rdx)
; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
; AVX2-NEXT: vmovaps %ymm15, 96(%rsi)
; AVX2-NEXT: vmovaps %ymm14, (%rsi)
; AVX2-NEXT: vmovaps %ymm13, 32(%rsi)
; AVX2-NEXT: vmovaps %ymm6, 96(%rdx)
; AVX2-NEXT: vmovaps %ymm8, (%rdx)
; AVX2-NEXT: vmovaps %ymm5, 32(%rdx)
; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-NEXT: vmovaps %ymm0, (%rcx)
56 changes: 37 additions & 19 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1267,14 +1267,14 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_3254:
; AVX2: # %bb.0:
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_3254:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_3254:
@@ -1285,8 +1285,8 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_3254:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
ret <4 x i64> %shuffle
@@ -1379,18 +1379,36 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
}

define <4 x i64> @shuffle_v4i64_2741(<4 x i64> %a, <4 x i64> %b) {
; AVX1OR2-LABEL: shuffle_v4i64_2741:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[0,1]
; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX1OR2-NEXT: retq
; AVX1-LABEL: shuffle_v4i64_2741:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
; AVX1-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_2741:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1]
; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
; AVX2-LABEL: shuffle_v4i64_2741:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v4i64_2741:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_2741:
; AVX512VL-FAST-ALL: # %bb.0:
; AVX512VL-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,4,1]
; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT: retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_2741:
; AVX512VL-FAST-PERLANE: # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 7, i32 4, i32 1>
ret <4 x i64> %shuffle
}
@@ -1767,8 +1785,8 @@ define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
;
; AVX2-LABEL: shuffle_v4i64_1234:
; AVX2: # %bb.0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1234: