diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a2eb28fc7f486d..5a21982dea4085 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36814,7 +36814,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, return SDValue(); } -// Canonicalize SHUFFLE(BINOP(X,C)) -> BINOP(SHUFFLE(X),SHUFFLE(C)). +// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, const SDLoc &DL) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -36822,11 +36822,14 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, auto IsMergeableWithShuffle = [](SDValue Op) { // AllZeros/AllOnes constants are freely shuffled and will peek through - // bitcasts. Other constant build vectors do not peek through bitcasts. + // bitcasts. Other constant build vectors do not peek through bitcasts. Only + // merge with a target shuffle if it has one use, so shuffle combining is + // likely to kick in. 
return ISD::isBuildVectorAllOnes(Op.getNode()) || ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || - ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()); + ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) || + (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()); }; auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) { // Ensure we only shuffle whole vector src elements, unless its a logical diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index 48d4fe5565554d..c83a7b73edf5a4 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX1,AVX1-FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 define float @pr26491(<4 x float> %a0) { @@ -72,11 +72,11 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE2-NEXT: subpd {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: addpd %xmm2, %xmm3 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] -; SSE2-NEXT: divpd %xmm3, %xmm1 -; SSE2-NEXT: divpd %xmm3, %xmm0 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSE2-NEXT: 
unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: addpd %xmm3, %xmm2 +; SSE2-NEXT: divpd %xmm2, %xmm1 +; SSE2-NEXT: divpd %xmm2, %xmm0 ; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm2, %xmm1 @@ -87,10 +87,9 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; SSSE3-SLOW-NEXT: movq %rdi, %xmm2 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSSE3-SLOW-NEXT: subpd {{.*}}(%rip), %xmm2 -; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0] +; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm3 = xmm2[0,0] +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2 @@ -111,31 +110,17 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: retq ; -; AVX1-SLOW-LABEL: PR41414: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] -; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0 -; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: PR41414: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vmovq %rdi, %xmm1 -; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-FAST-NEXT: vdivpd 
%ymm1, %ymm0, %ymm0 -; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-FAST-NEXT: retq +; AVX1-LABEL: PR41414: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq ; ; AVX2-LABEL: PR41414: ; AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 282ef37f6e52e4..22007df8320a1d 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -364,29 +364,10 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) { ; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 ; SSSE3_FAST-NEXT: retq ; -; AVX1_SLOW-LABEL: hadd_v4f64: -; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1_SLOW-NEXT: retq -; -; AVX1_FAST-LABEL: hadd_v4f64: -; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX1_FAST-NEXT: retq -; -; AVX2_SLOW-LABEL: hadd_v4f64: -; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX2_SLOW-NEXT: retq -; -; AVX2_FAST-LABEL: hadd_v4f64: -; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX2_FAST-NEXT: retq +; AVX-LABEL: hadd_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fadd <4 x double> %a0, %a1 @@ -457,29 +438,10 
@@ define <4 x double> @hsub_v4f64(<4 x double> %a) { ; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1 ; SSSE3_FAST-NEXT: retq ; -; AVX1_SLOW-LABEL: hsub_v4f64: -; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1_SLOW-NEXT: retq -; -; AVX1_FAST-LABEL: hsub_v4f64: -; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 -; AVX1_FAST-NEXT: retq -; -; AVX2_SLOW-LABEL: hsub_v4f64: -; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX2_SLOW-NEXT: retq -; -; AVX2_FAST-LABEL: hsub_v4f64: -; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 -; AVX2_FAST-NEXT: retq +; AVX-LABEL: hsub_v4f64: +; AVX: # %bb.0: +; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fsub <4 x double> %a0, %a1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index 68d0584331797d..8a5e1cd6636420 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -470,9 +470,8 @@ define <2 x double> @add_pd_010(<2 x double> %x) { ; SSE-SLOW-LABEL: add_pd_010: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] -; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_pd_010: @@ -601,10 +600,10 @@ define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) { define <4 x float> @add_ps_017(<4 x float> %x) { ; SSE-SLOW-LABEL: add_ps_017: ; SSE-SLOW: # %bb.0: -; 
SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] -; SSE-SLOW-NEXT: addps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_ps_017: @@ -926,10 +925,10 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { ; SSE-SLOW-LABEL: PR45747_1: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-SLOW-NEXT: addps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2,2,2] -; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR45747_1: @@ -957,9 +956,10 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { ; SSE-SLOW-LABEL: PR45747_2: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-SLOW-NEXT: addps %xmm1, %xmm0 -; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR45747_2: @@ -1009,14 +1009,14 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) { define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) { ; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE-SLOW-NEXT: movaps %xmm0, %xmm2 +; SSE-SLOW-NEXT: movlhps 
{{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] ; SSE-SLOW-NEXT: addps %xmm2, %xmm0 -; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; SSE-SLOW-NEXT: addps %xmm1, %xmm2 -; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE-SLOW-NEXT: addps %xmm1, %xmm3 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[0,3] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR34724_add_v4f32_0u23: @@ -1026,14 +1026,9 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) { ; ; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2] +; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: PR34724_add_v4f32_0u23: diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index 89434cc4650d0b..156a423970bc9f 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -19,21 +19,20 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) { ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0 -; SSSE3-SLOW-NEXT: 
movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm2[0,0] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32: @@ -45,17 +44,11 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = 
xmm1[1,1,3,3] -; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -71,17 +64,11 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -118,21 +105,19 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) { ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; 
SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -144,21 +129,18 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; 
AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; @@ -171,21 +153,20 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %xmm2 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> @@ -218,27 +199,24 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> 
%0, <4 x i32> %1, <4 x i32> %2, define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) { ; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm8 -; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm8, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm8 +; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3 ; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm5[3,1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm8 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[3,1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm7 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2] -; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[0,2] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -262,13 +240,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x 
float> %1, <4 x fl ; ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8 -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm2 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3 @@ -314,13 +290,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8 -; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2 @@ -411,29 +385,25 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 
x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) { ; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm8 -; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm0 -; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm8 +; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: movdqa %xmm8, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[2,0] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm8 -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[2,0] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm7 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2] -; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32: @@ 
-460,13 +430,10 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8 -; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3] ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm3 @@ -518,13 +485,10 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 @@ -627,77 +591,67 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5 -; 
SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-SLOW-NEXT: addps %xmm5, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm2 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] +; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = 
xmm3[0,1],xmm1[2,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 -; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-FAST-NEXT: addps %xmm5, %xmm0 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSSE3-FAST-NEXT: addps %xmm4, %xmm2 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1] +; SSSE3-FAST-NEXT: addps %xmm4, %xmm5 +; SSSE3-FAST-NEXT: addps %xmm5, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 +; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSSE3-FAST-NEXT: addps %xmm0, %xmm2 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-FAST-NEXT: addps 
%xmm4, %xmm3 -; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSSE3-FAST-NEXT: addps %xmm2, %xmm3 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm5, %xmm4 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] +; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm4, %xmm1 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; AVX-SLOW-NEXT: vaddps %xmm3, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] @@ -710,18 +664,15 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-FAST-NEXT: vaddps %xmm4, %xmm5, %xmm4 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} 
xmm1 = xmm1[1,0] ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] +; AVX-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm4, %xmm1 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] +; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 @@ -765,18 +716,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm4 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm2 -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm3 +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3 -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; 
SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: retq ; @@ -789,17 +740,17 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm2 -; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4 +; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3 ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm2 -; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; @@ -811,22 +762,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; 
AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; @@ -838,20 +787,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1 -; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] +; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; 
AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-FAST-NEXT: retq ; @@ -863,22 +810,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpaddd %xmm3, 
%xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq ; @@ -890,20 +835,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -1081,28 +1024,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 -; 
SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-SLOW-NEXT: addps %xmm0, 
%xmm4 +; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1129,23 +1072,19 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] +; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] +; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm3, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1179,22 +1118,23 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32 ; 
SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 +; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: @@ -1217,22 +1157,22 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm1[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 ; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] -; AVX-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index a70b25a830ced7..18cd42c8c1de33 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -511,39 +511,21 @@ define <4 x 
float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) { ; X86-LABEL: signbits_mask_ashr_smax: ; X86: # %bb.0: -; X86-NEXT: vpsrad $26, %xmm0, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm0, %xmm3 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X86-NEXT: vpsrad $26, %xmm1, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm1, %xmm3 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-AVX1-LABEL: signbits_mask_ashr_smax: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X64-AVX1-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; @@ -569,39 +551,21 @@ declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) { ; X86-LABEL: signbits_mask_ashr_smin: ; X86: # %bb.0: -; X86-NEXT: vpsrad $26, %xmm0, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm0, %xmm3 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X86-NEXT: vpsrad $26, %xmm1, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm1, %xmm3 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-AVX1-LABEL: signbits_mask_ashr_smin: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; @@ -627,39 +591,21 @@ declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) { ; X86-LABEL: signbits_mask_ashr_umax: ; X86: # %bb.0: -; X86-NEXT: vpsrad $26, %xmm0, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm0, %xmm3 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X86-NEXT: vpsrad $26, %xmm1, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm1, %xmm3 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-AVX1-LABEL: signbits_mask_ashr_umax: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, 
%xmm1, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; @@ -685,39 +631,21 @@ declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) { ; X86-LABEL: signbits_mask_ashr_umin: ; X86: # %bb.0: -; X86-NEXT: vpsrad $26, %xmm0, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm0, %xmm3 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X86-NEXT: vpsrad $26, %xmm1, %xmm2 -; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X86-NEXT: vpsrad $27, %xmm1, %xmm3 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-AVX1-LABEL: signbits_mask_ashr_umin: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll index ee27ac12739a3b..d999f5089a21fa 100644 --- a/llvm/test/CodeGen/X86/phaddsub.ll +++ b/llvm/test/CodeGen/X86/phaddsub.ll @@ -412,8 +412,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { ; SSSE3-SLOW-LABEL: phaddd_single_source5: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: phaddd_single_source5: @@ -425,8 +425,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { ; AVX-SLOW-LABEL: phaddd_single_source5: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: phaddd_single_source5: @@ -438,8 +438,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { ; AVX2-SHUF-LABEL: phaddd_single_source5: ; AVX2-SHUF: # %bb.0: ; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-SHUF-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX2-SHUF-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %x diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index 3cd9c117c62047..abb310d3a518ce 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -185,7 +185,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rsi -; SSE2-NEXT: movq %rsi, %xmm0 +; SSE2-NEXT: movq %rsi, %xmm1 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: imulq %r8 ; SSE2-NEXT: movq %rdx, %rax @@ -193,10 +193,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: addq %rdx, %rax ; SSE2-NEXT: leaq (%rax,%rax,8), %rax ; SSE2-NEXT: subq %rax, %rdi -; SSE2-NEXT: movq %rdi, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591] -; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movq %rdi, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591] +; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: imulq %rdx @@ -208,14 +208,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax ; SSE2-NEXT: addq %rcx, %rax ; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 ; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3] +; SSE2-NEXT: andps %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 83db253d122d47..7cf566f7b3a1d9 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -853,13 +853,13 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE-NEXT: pcmpeqd %xmm2, %xmm3 +; SSE-NEXT: pcmpgtd %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3] +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index 69d66ebcdb69fd..dde6832d648221 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -899,13 +899,13 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; SSE-NEXT: psubq %xmm1, %xmm0 ; SSE-NEXT: pxor %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3] +; SSE-NEXT: pand %xmm3, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3] +; SSE-NEXT: por %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm0, (%rdi) ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll index 6adb6b0c2c0b86..d62462c4e59ac0 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -363,8 +363,8 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) { ; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: ; AMD10H: # %bb.0: ; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0 -; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AMD10H-NEXT: andps {{.*}}(%rip), %xmm0 ; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AMD10H-NEXT: packuswb %xmm0, %xmm0 ; AMD10H-NEXT: retq