diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5a21982dea408..a2eb28fc7f486 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36814,7 +36814,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, return SDValue(); } -// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). +// Canonicalize SHUFFLE(BINOP(X,C)) -> BINOP(SHUFFLE(X),SHUFFLE(C)). static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, const SDLoc &DL) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); @@ -36822,14 +36822,11 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG, auto IsMergeableWithShuffle = [](SDValue Op) { // AllZeros/AllOnes constants are freely shuffled and will peek through - // bitcasts. Other constant build vectors do not peek through bitcasts. Only - // merge with target shuffles if it has one use so shuffle combining is - // likely to kick in. + // bitcasts. Other constant build vectors do not peek through bitcasts. return ISD::isBuildVectorAllOnes(Op.getNode()) || ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || - ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) || - (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()); + ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()); }; auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) { // Ensure we only shuffle whole vector src elements, unless its a logical diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll index c83a7b73edf5a..48d4fe5565554 100644 --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -2,8 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSSE3-FAST -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1,AVX1-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefix=AVX1-FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 define float @pr26491(<4 x float> %a0) { @@ -72,11 +72,11 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSE2-NEXT: subpd {{.*}}(%rip), %xmm2 ; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: addpd %xmm3, %xmm2 -; SSE2-NEXT: divpd %xmm2, %xmm1 -; SSE2-NEXT: divpd %xmm2, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSE2-NEXT: addpd %xmm2, %xmm3 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0] +; SSE2-NEXT: divpd %xmm3, %xmm1 +; SSE2-NEXT: divpd %xmm3, %xmm0 ; SSE2-NEXT: xorpd %xmm2, %xmm2 ; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm2, %xmm1 @@ -87,9 +87,10 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; SSSE3-SLOW-NEXT: movq %rdi, %xmm2 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1] ; SSSE3-SLOW-NEXT: subpd 
{{.*}}(%rip), %xmm2 -; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm3 = xmm2[0,0] -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movapd %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm3[0,0] ; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: divpd %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: xorpd %xmm2, %xmm2 @@ -110,17 +111,31 @@ define <4 x double> @PR41414(i64 %x, <4 x double> %y) { ; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1 ; SSSE3-FAST-NEXT: retq ; -; AVX1-LABEL: PR41414: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovq %rdi, %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; AVX1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX1-SLOW-LABEL: PR41414: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovq %rdi, %xmm1 +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-SLOW-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; AVX1-SLOW-NEXT: vaddpd %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-SLOW-NEXT: vdivpd %ymm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: PR41414: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vmovq %rdi, %xmm1 +; AVX1-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; AVX1-FAST-NEXT: vsubpd {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-FAST-NEXT: vdivpd %ymm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq ; ; AVX2-LABEL: PR41414: ; AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 22007df8320a1..282ef37f6e52e 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -364,10 +364,29 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) { ; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 ; SSSE3_FAST-NEXT: retq ; -; AVX-LABEL: hadd_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1_SLOW-LABEL: hadd_v4f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fadd <4 x double> %a0, %a1 @@ -438,10 +457,29 
@@ define <4 x double> @hsub_v4f64(<4 x double> %a) { ; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1 ; SSSE3_FAST-NEXT: retq ; -; AVX-LABEL: hsub_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1_SLOW-LABEL: hsub_v4f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fsub <4 x double> %a0, %a1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index 8a5e1cd663642..68d0584331797 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -470,8 +470,9 @@ define <2 x double> @add_pd_010(<2 x double> %x) { ; SSE-SLOW-LABEL: add_pd_010: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 +; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_pd_010: @@ -600,10 +601,10 @@ define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) { define <4 x float> @add_ps_017(<4 x float> %x) { ; SSE-SLOW-LABEL: add_ps_017: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: add_ps_017: @@ -925,10 +926,10 @@ define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { ; SSE-SLOW-LABEL: PR45747_1: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,2] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR45747_1: @@ -956,10 +957,9 @@ define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { ; SSE-SLOW-LABEL: PR45747_2: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 -; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} 
xmm0 = xmm0[1,1] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR45747_2: @@ -1009,14 +1009,14 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) { define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) { ; SSE-SLOW-LABEL: PR34724_add_v4f32_0u23: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm0, %xmm2 -; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE-SLOW-NEXT: addps %xmm2, %xmm0 -; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE-SLOW-NEXT: addps %xmm1, %xmm2 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm1, %xmm3 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[0,3] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR34724_add_v4f32_0u23: @@ -1026,9 +1026,14 @@ define <4 x float> @PR34724_add_v4f32_0u23(<4 x float> %0, <4 x float> %1) { ; ; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2] -; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm2 +; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: PR34724_add_v4f32_0u23: diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll index 156a423970bc9..89434cc4650d0 100644 --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -19,20 +19,21 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) { ; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-SLOW-NEXT: movddup {{.*#+}} xmm2 = xmm2[0,0] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 +; 
SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32: @@ -44,11 +45,17 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX1-SLOW-LABEL: pair_sum_v4f32_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1] -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] -; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -64,11 +71,17 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX2-SLOW-LABEL: pair_sum_v4f32_v4f32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,1] +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] -; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -105,19 +118,21 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) { ; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: 
shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -129,18 +144,21 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; @@ -153,20 +171,21 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX2-SLOW-LABEL: pair_sum_v4i32_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX2-SLOW-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> @@ -199,24 +218,27 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5, <4 x float> %6, <4 x float> %7) { ; SSSE3-SLOW-LABEL: pair_sum_v8f32_v4f32: ; 
SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm8 +; SSSE3-SLOW-NEXT: haddps %xmm0, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: haddps %xmm8, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm8 ; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[3,1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm5[3,1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm8 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm7 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[0,2] -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2] +; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -240,11 +262,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1] -; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0 +; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8 +; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm2 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3 @@ -290,11 +314,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl ; ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1] -; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0 +; AVX2-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm8[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vhaddps %xmm3, 
%xmm2, %xmm2 @@ -385,25 +411,29 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, <4 x i32> %5, <4 x i32> %6, <4 x i32> %7) { ; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm8 +; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm0 +; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm8 ; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] +; SSSE3-SLOW-NEXT: movdqa %xmm8, %xmm1 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[2,0] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,3],xmm2[2,0] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm8 +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0] ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm7 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2] -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,3],xmm6[0,2] +; SSSE3-SLOW-NEXT: movaps %xmm8, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32: @@ -430,10 +460,13 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1] +; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8 +; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3] ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm3 @@ -485,10 +518,13 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, ; ; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1] +; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm8, %xmm8 +; AVX2-SLOW-NEXT: vphaddd 
%xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] ; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2 @@ -591,67 +627,77 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; SSSE3-SLOW-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5 -; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm0[0,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] -; SSSE3-SLOW-NEXT: addps %xmm4, %xmm5 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] -; SSSE3-SLOW-NEXT: addps %xmm5, %xmm1 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm2 +; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-SLOW-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-SLOW-NEXT: addps %xmm5, %xmm0 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm2 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm3 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm3 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: sequential_sum_v4f32_v4f32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 -; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 -; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3] -; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,1] -; SSSE3-FAST-NEXT: addps %xmm4, %xmm5 -; SSSE3-FAST-NEXT: addps %xmm5, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2 -; SSSE3-FAST-NEXT: unpckhpd 
{{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-FAST-NEXT: addps %xmm0, %xmm2 +; SSSE3-FAST-NEXT: haddps %xmm1, %xmm5 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-FAST-NEXT: addps %xmm5, %xmm0 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,2,3] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSSE3-FAST-NEXT: addps %xmm4, %xmm2 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 +; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4 +; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-FAST-NEXT: addps %xmm2, %xmm3 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3] -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,0] -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: addps %xmm4, %xmm3 +; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: sequential_sum_v4f32_v4f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm4 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX-SLOW-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm5, %xmm4 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm4, %xmm1 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; AVX-SLOW-NEXT: vaddps %xmm3, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] @@ -664,15 +710,18 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, < ; AVX-FAST-LABEL: sequential_sum_v4f32_v4f32: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm4 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,2,3] ; AVX-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-FAST-NEXT: vaddps %xmm4, %xmm5, %xmm4 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero +; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1] -; AVX-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm2[2,3] +; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm4, %xmm1 -; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3] -; AVX-FAST-NEXT: vaddps %xmm1, 
%xmm0, %xmm0 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1 @@ -716,18 +765,18 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 -; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5 -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm2 +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3 +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm3 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3 -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: retq ; @@ -740,17 +789,17 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4 +; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm2 ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4 +; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4 ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3 ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm2 -; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3-FAST-NEXT: retq ; @@ -762,20 +811,22 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; 
AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; @@ -787,18 +838,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1 +; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-FAST-NEXT: retq ; @@ -810,20 +863,22 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm4 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} 
xmm4 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: retq ; @@ -835,18 +890,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -1024,28 +1081,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] -; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3] +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3 -; 
SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] -; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] -; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSSE3-SLOW-NEXT: addps %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 +; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0] +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0] ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1072,19 +1129,23 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1],xmm2[1,1] -; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = xmm0[1],xmm1[1],zero,zero -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,0] -; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm4, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; AVX-SLOW-NEXT: vaddps %xmm4, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] +; AVX-SLOW-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm3, %xmm2, %xmm2 +; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc: @@ -1118,23 +1179,22 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm4 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = 
xmm1[1,1,1,1] -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3 ; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSSE3-SLOW-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0] -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movdqa %xmm4, %xmm0 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32: @@ -1157,22 +1217,22 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1] +; AVX-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 18cd42c8c1de3..a70b25a830ced 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -511,21 +511,39 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) { ; X86-LABEL: signbits_mask_ashr_smax: ; X86: # %bb.0: +; X86-NEXT: vpsrad $26, %xmm0, %xmm2 +; X86-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT: vpsrad $26, %xmm1, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_smax:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -551,21 +569,39 @@ declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_smin:
; X86: # %bb.0:
+; X86-NEXT: vpsrad $26, %xmm0, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT: vpsrad $26, %xmm1, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_smin:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -591,21 +627,39 @@ declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_umax:
; X86: # %bb.0:
+; X86-NEXT: vpsrad $26, %xmm0, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT: vpsrad $26, %xmm1, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_umax:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
@@ -631,21 +685,39 @@ declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: signbits_mask_ashr_umin:
; X86: # %bb.0:
+; X86-NEXT: vpsrad $26, %xmm0, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm0, %xmm3
; X86-NEXT: vpsrad $25, %xmm0, %xmm0
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X86-NEXT: vpsrad $26, %xmm1, %xmm2
+; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X86-NEXT: vpsrad $27, %xmm1, %xmm3
; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X86-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-AVX1-LABEL: signbits_mask_ashr_umin:
; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vpsrad $26, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm0, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; X64-AVX1-NEXT: vpsrad $26, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X64-AVX1-NEXT: vpsrad $27, %xmm1, %xmm3
; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll
index d999f5089a21f..ee27ac12739a3 100644
--- a/llvm/test/CodeGen/X86/phaddsub.ll
+++ b/llvm/test/CodeGen/X86/phaddsub.ll
@@ -412,8 +412,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; SSSE3-SLOW-LABEL: phaddd_single_source5:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: phaddd_single_source5:
@@ -425,8 +425,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; AVX-SLOW-LABEL: phaddd_single_source5:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddd_single_source5:
@@ -438,8 +438,8 @@ define <4 x i32> @phaddd_single_source5(<4 x i32> %x) {
; AVX2-SHUF-LABEL: phaddd_single_source5:
; AVX2-SHUF: # %bb.0:
; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,2,2]
-; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX2-SHUF-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
%add = add <4 x i32> %l, %x
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index abb310d3a518c..3cd9c117c6204 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -185,7 +185,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
; SSE2-NEXT: subq %rax, %rsi
-; SSE2-NEXT: movq %rsi, %xmm1
+; SSE2-NEXT: movq %rsi, %xmm0
; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: imulq %r8
; SSE2-NEXT: movq %rdx, %rax
@@ -193,10 +193,10 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: leaq (%rax,%rax,8), %rax
; SSE2-NEXT: subq %rax, %rdi
-; SSE2-NEXT: movq %rdi, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8589934591,8589934591]
-; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: movq %rdi, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [8589934591,8589934591]
+; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: movabsq $2049638230412172401, %rdx # imm = 0x1C71C71C71C71C71
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: imulq %rdx
@@ -208,13 +208,14 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
; SSE2-NEXT: leaq (%rdx,%rdx,8), %rax
; SSE2-NEXT: addq %rcx, %rax
; SSE2-NEXT: movq %rax, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
-; SSE2-NEXT: andps %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 7cf566f7b3a1d..83db253d122d4 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -853,13 +853,13 @@ define <2 x i32> @uaddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
-; SSE-NEXT: por %xmm3, %xmm0
+; SSE-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: movdqa %xmm1, (%rdi)
; SSE-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
index dde6832d64822..69d66ebcdb69f 100644
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -899,13 +899,13 @@ define <2 x i32> @usubo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
; SSE-NEXT: psubq %xmm1, %xmm0
; SSE-NEXT: pxor %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,3,3]
-; SSE-NEXT: pand %xmm3, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,3,3]
-; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: movdqa %xmm0, (%rdi)
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
index d62462c4e59ac..6adb6b0c2c0b8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -363,8 +363,8 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
; AMD10H: # %bb.0:
; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AMD10H-NEXT: andps {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0
+; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AMD10H-NEXT: packuswb %xmm0, %xmm0
; AMD10H-NEXT: retq