[x86] split more 256-bit stores of concatenated vectors
As suggested in D62498: collectConcatOps() matches both
concat_vectors and insert_subvector patterns, so we see
more test improvements by using the more general match.

llvm-svn: 362620
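
To illustrate the reasoning, here is a minimal, self-contained sketch. It is not LLVM's actual collectConcatOps() implementation; it ignores subvector indices, value types, and one-use checks, and uses hypothetical Node/Opcode stand-ins rather than real SelectionDAG types. It only shows why a matcher that also peeks through insert_subvector chains fires on more 256-bit stored values than a plain ISD::CONCAT_VECTORS opcode check: the same "two 128-bit halves" value can be built either way.

// Hypothetical stand-ins for SelectionDAG nodes -- not LLVM types.
#include <cstdio>
#include <vector>

enum class Opcode { Undef, Load128, ConcatVectors, InsertSubvector };

struct Node {
  Opcode Op;
  std::vector<const Node *> Operands; // InsertSubvector: {Base, Sub} (index omitted)
};

// Old-style check: only an explicit concat_vectors node qualifies.
static bool isConcatOpcodeOnly(const Node *N) {
  return N->Op == Opcode::ConcatVectors;
}

// collectConcatOps()-style check (simplified): also recognize a chain of
// insert_subvector nodes that fills an undef value with both halves.
static bool collectConcatOpsSketch(const Node *N,
                                   std::vector<const Node *> &Ops) {
  if (N->Op == Opcode::ConcatVectors) {
    Ops = N->Operands;
    return true;
  }
  if (N->Op == Opcode::InsertSubvector) {
    const Node *Base = N->Operands[0];
    const Node *Hi = N->Operands[1];
    // Matches: insert_subvector (insert_subvector undef, lo, 0), hi, half.
    if (Base->Op == Opcode::InsertSubvector &&
        Base->Operands[0]->Op == Opcode::Undef) {
      Ops = {Base->Operands[1], Hi};
      return true;
    }
  }
  return false;
}

int main() {
  Node Lo{Opcode::Load128, {}};
  Node Hi{Opcode::Load128, {}};
  Node Undef{Opcode::Undef, {}};

  // Form 1: t = concat_vectors lo, hi
  Node Concat{Opcode::ConcatVectors, {&Lo, &Hi}};
  // Form 2: t = insert_subvector (insert_subvector undef, lo, 0), hi, half
  Node InsLo{Opcode::InsertSubvector, {&Undef, &Lo}};
  Node InsHi{Opcode::InsertSubvector, {&InsLo, &Hi}};

  std::vector<const Node *> Halves;
  std::printf("concat_vectors form:   opcode check=%d, generalized=%d\n",
              isConcatOpcodeOnly(&Concat),
              collectConcatOpsSketch(&Concat, Halves));
  std::printf("insert_subvector form: opcode check=%d, generalized=%d\n",
              isConcatOpcodeOnly(&InsHi),
              collectConcatOpsSketch(&InsHi, Halves));
  return 0;
}

When the generalized check succeeds (and the stored value has one use), LowerStore calls splitVectorStore() and emits two 128-bit stores instead of a vinsertf128 plus a 256-bit store, which is what the test diffs below show.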
rotateright committed Jun 5, 2019
1 parent 5659b36 commit 2bf8287
Showing 4 changed files with 113 additions and 124 deletions.
7 changes: 4 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21085,9 +21085,10 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
if (StoreVT.is256BitVector()) {
if (StoredVal.getOpcode() != ISD::CONCAT_VECTORS || !StoredVal.hasOneUse())
return SDValue();
return splitVectorStore(St, DAG);
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
return SDValue();
}

assert(StoreVT.isVector() && StoreVT.getSizeInBits() == 64 &&
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/memset-nonzero.ll
@@ -259,9 +259,8 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
157 changes: 73 additions & 84 deletions llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1186,18 +1186,17 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
; AVX1-NEXT: vmovdqu (%rcx), %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,0,0,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,u,u,4,5,6,7,u,u,8,9,10,11]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vmovdqu %xmm4, 32(%rdi)
; AVX1-NEXT: vmovdqu %xmm2, 16(%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i16_in:
@@ -1802,17 +1801,16 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, <96 x i8> *%a2) {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat3_128:
@@ -1842,34 +1840,29 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, <96 x i8> *%a2) {
;
; XOP-LABEL: splat3_128:
; XOP: # %bb.0:
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
; XOP-NEXT: vpperm %xmm7, %xmm5, %xmm3, %xmm3
; XOP-NEXT: vpperm %xmm7, %xmm0, %xmm6, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOP-NEXT: vpperm %xmm7, %xmm6, %xmm5, %xmm3
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; XOP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpshufb %xmm5, %xmm4, %xmm3
; XOP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
; XOP-NEXT: vmovaps %ymm0, (%rdi)
; XOP-NEXT: vmovaps %ymm2, 64(%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0
; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpperm %xmm3, %xmm1, %xmm6, %xmm1
; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7
; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3
; XOP-NEXT: vmovdqa %xmm3, 80(%rdi)
; XOP-NEXT: vmovdqa %xmm7, 64(%rdi)
; XOP-NEXT: vmovdqa %xmm1, 48(%rdi)
; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
; XOP-NEXT: vmovdqa %xmm2, 16(%rdi)
; XOP-NEXT: vmovdqa %xmm0, (%rdi)
; XOP-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1980,16 +1973,16 @@ define void @splat3_256(<32 x i8> %a0, <96 x i8> *%a1) {
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovaps %ymm2, 64(%rdi)
; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm0, (%rdi)
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi)
; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -2018,34 +2011,30 @@ define void @splat3_256(<32 x i8> %a0, <96 x i8> *%a1) {
;
; XOP-LABEL: splat3_256:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
; XOP-NEXT: vpperm %xmm7, %xmm5, %xmm3, %xmm3
; XOP-NEXT: vpperm %xmm7, %xmm0, %xmm6, %xmm0
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; XOP-NEXT: vpperm %xmm7, %xmm6, %xmm5, %xmm3
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; XOP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; XOP-NEXT: vpshufb %xmm5, %xmm4, %xmm3
; XOP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
; XOP-NEXT: vmovaps %ymm0, (%rdi)
; XOP-NEXT: vmovaps %ymm2, 64(%rdi)
; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
; XOP-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10]
; XOP-NEXT: vpperm %xmm3, %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm3, %xmm0, %xmm7, %xmm0
; XOP-NEXT: vpperm %xmm3, %xmm7, %xmm4, %xmm4
; XOP-NEXT: vpperm %xmm3, %xmm2, %xmm6, %xmm2
; XOP-NEXT: vpperm %xmm3, %xmm5, %xmm8, %xmm7
; XOP-NEXT: vpperm %xmm3, %xmm6, %xmm5, %xmm3
; XOP-NEXT: vmovdqa %xmm3, 80(%rdi)
; XOP-NEXT: vmovdqa %xmm7, 64(%rdi)
; XOP-NEXT: vmovdqa %xmm2, 48(%rdi)
; XOP-NEXT: vmovdqa %xmm4, 32(%rdi)
; XOP-NEXT: vmovdqa %xmm1, 16(%rdi)
; XOP-NEXT: vmovdqa %xmm0, (%rdi)
; XOP-NEXT: vzeroupper
; XOP-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
68 changes: 34 additions & 34 deletions llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1153,16 +1153,16 @@ define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, 64(%rdi)
; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
; AVX1-NEXT: vmovups %ymm1, (%rdi)
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vmovdqu %xmm3, 80(%rdi)
; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi)
; AVX1-NEXT: vmovdqu %xmm5, 48(%rdi)
; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi)
; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm1, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
Expand Down Expand Up @@ -1255,37 +1255,37 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm1
; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm4
; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm4
; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm5
; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm6
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
; AVX1-NEXT: vmovups %ymm5, 160(%rdi)
; AVX1-NEXT: vmovups %ymm4, 128(%rdi)
; AVX1-NEXT: vmovups %ymm3, 96(%rdi)
; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm14
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm12
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm11
; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm9
; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm2
; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm4
; AVX1-NEXT: vmovdqu %xmm1, 80(%rdi)
; AVX1-NEXT: vmovdqu %xmm7, 64(%rdi)
; AVX1-NEXT: vmovdqu %xmm6, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm14, (%rdi)
; AVX1-NEXT: vmovdqu %xmm0, 48(%rdi)
; AVX1-NEXT: vmovdqu %xmm12, 32(%rdi)
; AVX1-NEXT: vmovdqu %xmm4, 176(%rdi)
; AVX1-NEXT: vmovdqu %xmm3, 160(%rdi)
; AVX1-NEXT: vmovdqu %xmm5, 112(%rdi)
; AVX1-NEXT: vmovdqu %xmm11, 96(%rdi)
; AVX1-NEXT: vmovdqu %xmm2, 144(%rdi)
; AVX1-NEXT: vmovdqu %xmm9, 128(%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
