diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3fc41e5b0bc1c..81da77b88bfa0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7557,6 +7557,19 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, } } + // REVERSE - attempt to match the loads in reverse and then shuffle back. + // TODO: Do this for any permute or mismatching element counts. + if (Depth == 0 && !ZeroMask && TLI.isTypeLegal(VT) && VT.isVector() && + NumElems == VT.getVectorNumElements()) { + SmallVector ReverseElts(Elts.rbegin(), Elts.rend()); + if (SDValue RevLd = EltsFromConsecutiveLoads( + VT, ReverseElts, DL, DAG, Subtarget, IsAfterLegalize, Depth + 1)) { + SmallVector ReverseMask(NumElems); + std::iota(ReverseMask.rbegin(), ReverseMask.rend(), 0); + return DAG.getVectorShuffle(VT, DL, RevLd, DAG.getUNDEF(VT), ReverseMask); + } + } + return SDValue(); } @@ -59490,8 +59503,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT, *FirstLd->getMemOperand(), &Fast) && Fast) { - if (SDValue Ld = - EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) + if (SDValue Ld = EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, + false, Depth + 1)) return Ld; } } diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 330c978d2a9f7..22c4ad28059e4 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -844,13 +844,11 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind { ; AVX512-NEXT: vmovq %rcx, %xmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovq %r8, %xmm1 -; AVX512-NEXT: vmovq %r9, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %r9, %xmm3 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 @@ -2071,13 +2069,11 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; AVX512-NEXT: vmovq %rcx, %xmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovq %r8, %xmm1 -; AVX512-NEXT: vmovq %r9, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1] +; AVX512-NEXT: vmovq %r8, %xmm2 +; AVX512-NEXT: vmovq %r9, %xmm3 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512-NEXT: vplzcntq %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll index 3edb712e53c8d..773eb8f6742e5 100644 --- a/llvm/test/CodeGen/X86/build-vector-256.ll +++ b/llvm/test/CodeGen/X86/build-vector-256.ll @@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) { ; AVX1-32-LABEL: test_buildvector_4f64_2_var: ; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] +; AVX1-32-NEXT: vmovupd {{[0-9]+}}(%esp), %xmm0 +; AVX1-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-32-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll index 3ced27f12c72a..18faec5747abe 100644 --- a/llvm/test/CodeGen/X86/chain_order.ll +++ b/llvm/test/CodeGen/X86/chain_order.ll @@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) { ; CHECK-LABEL: cftx020: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vmovupd (%rdi), %xmm1 ; CHECK-NEXT: vmovupd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 26f076d450c15..b6aae486dc315 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -354,53 +354,23 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp } define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp { -; SSE2-LABEL: merge_v4f32_f32_3210: -; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: merge_v4f32_f32_3210: -; SSE41: # %bb.0: -; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; SSE41-NEXT: retq +; SSE-LABEL: merge_v4f32_f32_3210: +; SSE: # %bb.0: +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq ; ; AVX-LABEL: merge_v4f32_f32_3210: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] ; AVX-NEXT: retq ; -; X86-SSE1-LABEL: merge_v4f32_f32_3210: -; X86-SSE1: # %bb.0: -; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; X86-SSE1-NEXT: retl -; -; X86-SSE41-LABEL: merge_v4f32_f32_3210: -; X86-SSE41: # %bb.0: -; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; X86-SSE41-NEXT: retl +; X86-SSE-LABEL: merge_v4f32_f32_3210: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups (%eax), %xmm0 +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-SSE-NEXT: retl %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 2 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 1 @@ -788,31 +758,15 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s } define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp { -; SSE2-LABEL: merge_v4i32_i32_3210: -; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: merge_v4i32_i32_3210: -; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pinsrd $1, 8(%rdi), %xmm0 -; SSE41-NEXT: pinsrd $2, 4(%rdi), %xmm0 -; SSE41-NEXT: pinsrd $3, (%rdi), %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: merge_v4i32_i32_3210: +; SSE: # %bb.0: +; SSE-NEXT: movdqu (%rdi), %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq ; ; AVX-LABEL: merge_v4i32_i32_3210: ; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, 8(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $2, 4(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrd $3, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_v4i32_i32_3210: @@ -842,10 +796,8 @@ define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp { ; X86-SSE41-LABEL: merge_v4i32_i32_3210: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE41-NEXT: pinsrd $1, 8(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrd $2, 4(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrd $3, (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] ; X86-SSE41-NEXT: retl %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2 @@ -1003,55 +955,22 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_8i16_i16_76543210: ; SSE2: # %bb.0: -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl 2(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl 4(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl 6(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movzwl 8(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl 10(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl 12(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: movzwl 14(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_8i16_i16_76543210: ; SSE41: # %bb.0: -; SSE41-NEXT: movzwl 14(%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pinsrw $1, 12(%rdi), %xmm0 -; SSE41-NEXT: pinsrw $2, 10(%rdi), %xmm0 -; SSE41-NEXT: pinsrw $3, 8(%rdi), %xmm0 -; SSE41-NEXT: pinsrw $4, 6(%rdi), %xmm0 -; SSE41-NEXT: pinsrw $5, 4(%rdi), %xmm0 -; SSE41-NEXT: pinsrw $6, 2(%rdi), %xmm0 -; SSE41-NEXT: pinsrw $7, (%rdi), %xmm0 +; SSE41-NEXT: movdqu (%rdi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_8i16_i16_76543210: ; AVX: # %bb.0: -; AVX-NEXT: movzwl 14(%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpinsrw $1, 12(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $2, 10(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, 8(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, 6(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, 4(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $6, 2(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_8i16_i16_76543210: @@ -1107,15 +1026,8 @@ define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ss ; X86-SSE41-LABEL: merge_8i16_i16_76543210: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movzwl 14(%eax), %ecx -; X86-SSE41-NEXT: movd %ecx, %xmm0 -; X86-SSE41-NEXT: pinsrw $1, 12(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrw $2, 10(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrw $3, 8(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrw $4, 6(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrw $5, 4(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrw $6, 2(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrw $7, (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] ; X86-SSE41-NEXT: retl %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7 %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6 @@ -1341,95 +1253,30 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210: ; SSE2: # %bb.0: -; SSE2-NEXT: movzbl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 1(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movzbl 2(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 3(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzbl 4(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 5(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movzbl 6(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 7(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movzbl 8(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 9(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl 10(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 11(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE2-NEXT: movzbl 12(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl 13(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movzbl 14(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: movzbl 15(%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210: ; SSE41: # %bb.0: -; SSE41-NEXT: movzbl 15(%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pinsrb $1, 14(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $2, 13(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $3, 12(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $4, 11(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $5, 10(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $6, 9(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $7, 8(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $8, 7(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $9, 6(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $10, 5(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $11, 4(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $12, 3(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $13, 2(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $14, 1(%rdi), %xmm0 -; SSE41-NEXT: pinsrb $15, (%rdi), %xmm0 +; SSE41-NEXT: movdqu (%rdi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210: ; AVX: # %bb.0: -; AVX-NEXT: movzbl 15(%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpinsrb $1, 14(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $2, 13(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $3, 12(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $4, 11(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $5, 10(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $6, 9(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $7, 8(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $8, 7(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $9, 6(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $10, 5(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $11, 4(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $12, 3(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $13, 2(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $14, 1(%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqu (%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_16i8_i8_FEDCBA9876543210: @@ -1507,23 +1354,8 @@ define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noin ; X86-SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movzbl 15(%eax), %ecx -; X86-SSE41-NEXT: movd %ecx, %xmm0 -; X86-SSE41-NEXT: pinsrb $1, 14(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $2, 13(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $3, 12(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $4, 11(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $5, 10(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $6, 9(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $7, 8(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $8, 7(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $9, 6(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $10, 5(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $11, 4(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $12, 3(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $13, 2(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $14, 1(%eax), %xmm0 -; X86-SSE41-NEXT: pinsrb $15, (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-SSE41-NEXT: retl %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 15 %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 14 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll index e5e99e17053a0..6ad306d2e6564 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -127,23 +127,27 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp } define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_v4f64_f64_3210: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_v4f64_f64_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_v4f64_f64_3210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_v4f64_f64_3210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_v4f64_f64_3210: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; X86-AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; X86-AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] ; X86-AVX-NEXT: retl %ptr0 = getelementptr inbounds double, ptr %ptr, i64 3 %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2 @@ -269,16 +273,21 @@ define <4 x i64> @merge_4i64_i64_23zz(ptr %ptr) nounwind uwtable noinline ssp { } define <4 x i64> @merge_v4i64_i64_3210(ptr %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_v4i64_i64_3210: -; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_v4i64_i64_3210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_v4i64_i64_3210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_v4i64_i64_3210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0] +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_v4i64_i64_3210: ; X86-AVX: # %bb.0: @@ -410,31 +419,29 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline } define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp { -; AVX-LABEL: merge_8f32_f32_76543210: -; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: merge_8f32_f32_76543210: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: merge_8f32_f32_76543210: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: merge_8f32_f32_76543210: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8f32_f32_76543210: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; X86-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; X86-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; X86-AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; X86-AVX-NEXT: retl %ptr0 = getelementptr inbounds float, ptr %ptr, i64 7 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 6 @@ -545,55 +552,27 @@ define <8 x i32> @merge_8i32_i32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss define <8 x i32> @merge_8i32_i32_76543210(ptr %ptr) nounwind uwtable noinline ssp { ; AVX1-LABEL: merge_8i32_i32_76543210: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, 8(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $2, 4(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, 24(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $2, 20(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $3, 16(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: merge_8i32_i32_76543210: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpinsrd $1, 8(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $2, 4(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpinsrd $1, 24(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $2, 20(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrd $3, 16(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: merge_8i32_i32_76543210: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpinsrd $1, 8(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrd $2, 4(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrd $3, (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-NEXT: vpinsrd $1, 24(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrd $2, 20(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrd $3, 16(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_8i32_i32_76543210: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpinsrd $1, 8(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrd $2, 4(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrd $3, (%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpinsrd $1, 24(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrd $2, 20(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrd $3, 16(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] +; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; X86-AVX-NEXT: retl %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 7 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 6 @@ -733,94 +712,36 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n define <16 x i16> @merge_16i16_i16_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { ; AVX1-LABEL: merge_16i16_i16_FEDCBA9876543210: ; AVX1: # %bb.0: -; AVX1-NEXT: movzwl 14(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpinsrw $1, 12(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $2, 10(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $3, 8(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $4, 6(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $5, 4(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $6, 2(%rdi), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0 -; AVX1-NEXT: movzwl 30(%rdi), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vpinsrw $1, 28(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $2, 26(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $3, 24(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $4, 22(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $5, 20(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $6, 18(%rdi), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrw $7, 16(%rdi), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: merge_16i16_i16_FEDCBA9876543210: ; AVX2: # %bb.0: -; AVX2-NEXT: movzwl 14(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vpinsrw $1, 12(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $2, 10(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $3, 8(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $4, 6(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $5, 4(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $6, 2(%rdi), %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0 -; AVX2-NEXT: movzwl 30(%rdi), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vpinsrw $1, 28(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $2, 26(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $3, 24(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $4, 22(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $5, 20(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $6, 18(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $7, 16(%rdi), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: merge_16i16_i16_FEDCBA9876543210: ; AVX512F: # %bb.0: -; AVX512F-NEXT: movzwl 14(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vpinsrw $1, 12(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrw $2, 10(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrw $3, 8(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrw $4, 6(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrw $5, 4(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrw $6, 2(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: movzwl 30(%rdi), %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vpinsrw $1, 28(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrw $2, 26(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrw $3, 24(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrw $4, 22(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrw $5, 20(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrw $6, 18(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vpinsrw $7, 16(%rdi), %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512F-NEXT: retq ; ; X86-AVX-LABEL: merge_16i16_i16_FEDCBA9876543210: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movzwl 14(%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpinsrw $1, 12(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrw $2, 10(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrw $3, 8(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrw $4, 6(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrw $5, 4(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrw $6, 2(%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: vpinsrw $7, (%eax), %xmm0, %xmm0 -; X86-AVX-NEXT: movzwl 30(%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm1 -; X86-AVX-NEXT: vpinsrw $1, 28(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrw $2, 26(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrw $3, 24(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrw $4, 22(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrw $5, 20(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrw $6, 18(%eax), %xmm1, %xmm1 -; X86-AVX-NEXT: vpinsrw $7, 16(%eax), %xmm1, %xmm1 +; X86-AVX-NEXT: vmovdqu (%eax), %xmm0 +; X86-AVX-NEXT: vmovdqu 16(%eax), %xmm1 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1] +; X86-AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-AVX-NEXT: retl %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 15 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll index fabca0ea5007e..f9a0bd7f424d6 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -151,33 +151,15 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline define <8 x double> @merge_8f64_f64_76543210(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8f64_f64_76543210: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; ALL-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1] -; ALL-NEXT: vmovhps {{.*#+}} xmm2 = xmm3[0,1],mem[0,1] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; ALL-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8f64_f64_76543210: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] -; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; X86-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] -; X86-AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; X86-AVX512F-NEXT: vpermpd (%eax), %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds double, ptr %ptr, i64 7 %ptr1 = getelementptr inbounds double, ptr %ptr, i64 6 @@ -288,21 +270,8 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss define <8 x i64> @merge_8i64_i64_76543210(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_8i64_i64_76543210: ; ALL: # %bb.0: -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; ALL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; ALL-NEXT: vpermpd (%rdi), %zmm0, %zmm0 ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_8i64_i64_76543210: @@ -466,49 +435,15 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable define <16 x float> @merge_16f32_f32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16f32_f32_FEDCBA9876543210: ; ALL: # %bb.0: -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; ALL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; ALL-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[2,3] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16f32_f32_FEDCBA9876543210: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; X86-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0] -; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3] -; X86-AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0] -; X86-AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; X86-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X86-AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds float, ptr %ptr, i64 15 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 14 @@ -672,49 +607,15 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n define <16 x i32> @merge_16i32_i32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp { ; ALL-LABEL: merge_16i32_i32_FEDCBA9876543210: ; ALL: # %bb.0: -; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vpinsrd $1, 8(%rdi), %xmm0, %xmm0 -; ALL-NEXT: vpinsrd $2, 4(%rdi), %xmm0, %xmm0 -; ALL-NEXT: vpinsrd $3, (%rdi), %xmm0, %xmm0 -; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; ALL-NEXT: vpinsrd $1, 24(%rdi), %xmm1, %xmm1 -; ALL-NEXT: vpinsrd $2, 20(%rdi), %xmm1, %xmm1 -; ALL-NEXT: vpinsrd $3, 16(%rdi), %xmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; ALL-NEXT: vpinsrd $1, 40(%rdi), %xmm1, %xmm1 -; ALL-NEXT: vpinsrd $2, 36(%rdi), %xmm1, %xmm1 -; ALL-NEXT: vpinsrd $3, 32(%rdi), %xmm1, %xmm1 -; ALL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; ALL-NEXT: vpinsrd $1, 56(%rdi), %xmm2, %xmm2 -; ALL-NEXT: vpinsrd $2, 52(%rdi), %xmm2, %xmm2 -; ALL-NEXT: vpinsrd $3, 48(%rdi), %xmm2, %xmm2 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; ALL-NEXT: retq ; ; X86-AVX512F-LABEL: merge_16i32_i32_FEDCBA9876543210: ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vpinsrd $1, 8(%eax), %xmm0, %xmm0 -; X86-AVX512F-NEXT: vpinsrd $2, 4(%eax), %xmm0, %xmm0 -; X86-AVX512F-NEXT: vpinsrd $3, (%eax), %xmm0, %xmm0 -; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vpinsrd $1, 24(%eax), %xmm1, %xmm1 -; X86-AVX512F-NEXT: vpinsrd $2, 20(%eax), %xmm1, %xmm1 -; X86-AVX512F-NEXT: vpinsrd $3, 16(%eax), %xmm1, %xmm1 -; X86-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vpinsrd $1, 40(%eax), %xmm1, %xmm1 -; X86-AVX512F-NEXT: vpinsrd $2, 36(%eax), %xmm1, %xmm1 -; X86-AVX512F-NEXT: vpinsrd $3, 32(%eax), %xmm1, %xmm1 -; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X86-AVX512F-NEXT: vpinsrd $1, 56(%eax), %xmm2, %xmm2 -; X86-AVX512F-NEXT: vpinsrd $2, 52(%eax), %xmm2, %xmm2 -; X86-AVX512F-NEXT: vpinsrd $3, 48(%eax), %xmm2, %xmm2 -; X86-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; X86-AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; X86-AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1] ; X86-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 15 %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 14