diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 315e795d7a37c..a5b34c482474f 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
 
 ; Vectorized Pairwise Sum Reductions
 ; e.g.
@@ -954,77 +954,137 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; }
 
 define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
-; SSSE3-LABEL: reduction_sum_v4f32_v4f32:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSSE3-NEXT:    movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    addss %xmm4, %xmm5
-; SSSE3-NEXT:    movaps %xmm0, %xmm6
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
-; SSSE3-NEXT:    addss %xmm5, %xmm6
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm6, %xmm0
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm5
-; SSSE3-NEXT:    movaps %xmm1, %xmm6
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
-; SSSE3-NEXT:    addss %xmm5, %xmm6
-; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm6, %xmm1
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm1
-; SSSE3-NEXT:    movaps %xmm2, %xmm5
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
-; SSSE3-NEXT:    addss %xmm1, %xmm5
-; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm5, %xmm2
-; SSSE3-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm1
-; SSSE3-NEXT:    movaps %xmm3, %xmm4
-; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-NEXT:    addss %xmm1, %xmm4
-; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
-; SSSE3-NEXT:    addss %xmm4, %xmm3
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT:    retq
+; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm0, %xmm4
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm5
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm5
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm5
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm5, %xmm1
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm2
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-SLOW-NEXT:    addss %xmm4, %xmm3
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-SLOW-NEXT:    retq
+;
+; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm5
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm5, %xmm0
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    haddps %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm5
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm5
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm5, %xmm1
+; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    haddps %xmm2, %xmm1
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm4
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm2
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    haddps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm4
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSSE3-FAST-NEXT:    addss %xmm1, %xmm4
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-FAST-NEXT:    addss %xmm4, %xmm3
+; SSSE3-FAST-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3-FAST-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm0, %xmm4
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm4
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm4, %xmm1
+; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm2, %xmm1
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm1, %xmm3, %xmm1
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; AVX-SLOW-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-SLOW-NEXT:    retq
 ;
-; AVX-LABEL: reduction_sum_v4f32_v4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; AVX-NEXT:    vaddss %xmm5, %xmm4, %xmm4
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX-NEXT:    vaddss %xmm6, %xmm4, %xmm4
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm0, %xmm4, %xmm0
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX-NEXT:    vaddss %xmm5, %xmm4, %xmm4
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX-NEXT:    vaddss %xmm6, %xmm4, %xmm4
-; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm1, %xmm4, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX-NEXT:    vaddss %xmm5, %xmm1, %xmm1
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-NEXT:    vaddss %xmm4, %xmm1, %xmm1
-; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; AVX-NEXT:    vaddss %xmm5, %xmm1, %xmm1
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX-NEXT:    retq
-  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %0)
-  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %1)
-  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %2)
-  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %3)
+; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm4
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm0, %xmm4, %xmm0
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm4
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm1, %xmm4, %xmm1
+; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; AVX-FAST-NEXT:    vhaddps %xmm2, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-FAST-NEXT:    vhaddps %xmm3, %xmm3, %xmm1
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-FAST-NEXT:    retq
+  %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
+  %6 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
+  %7 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
+  %8 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
   %9 = insertelement <4 x float> undef, float %5, i32 0
   %10 = insertelement <4 x float> %9, float %6, i32 1
   %11 = insertelement <4 x float> %10, float %7, i32 2
@@ -1033,6 +1093,102 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 }
 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
 
+define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
+; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; SSSE3-SLOW:       # %bb.0:
+; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-SLOW-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm0
+; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm4
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm4, %xmm1
+; SSSE3-SLOW-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT:    movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm2
+; SSSE3-SLOW-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm3
+; SSSE3-SLOW-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+; SSSE3-SLOW-NEXT:    retq
+;
+; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; SSSE3-FAST:       # %bb.0:
+; SSSE3-FAST-NEXT:    movaps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSSE3-FAST-NEXT:    addps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT:    addps %xmm1, %xmm0
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm4
+; SSSE3-FAST-NEXT:    movaps %xmm2, %xmm0
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSSE3-FAST-NEXT:    addps %xmm2, %xmm0
+; SSSE3-FAST-NEXT:    movaps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSSE3-FAST-NEXT:    addps %xmm3, %xmm1
+; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm1
+; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[2,0]
+; SSSE3-FAST-NEXT:    movaps %xmm4, %xmm0
+; SSSE3-FAST-NEXT:    retq
+;
+; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; AVX-SLOW:       # %bb.0:
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-SLOW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-SLOW-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; AVX-SLOW-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX-SLOW-NEXT:    vaddps %xmm3, %xmm2, %xmm2
+; AVX-SLOW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; AVX-SLOW-NEXT:    retq
+;
+; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
+; AVX-FAST:       # %bb.0:
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm4, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm1, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; AVX-FAST-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm2, %xmm1
+; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,0]
+; AVX-FAST-NEXT:    retq
+  %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
+  %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
+  %7 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %2)
+  %8 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %3)
+  %9 = insertelement <4 x float> undef, float %5, i32 0
+  %10 = insertelement <4 x float> %9, float %6, i32 1
+  %11 = insertelement <4 x float> %10, float %7, i32 2
+  %12 = insertelement <4 x float> %11, float %8, i32 3
+  ret <4 x float> %12
+}
+
 define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3) {
 ; SSSE3-SLOW-LABEL: reduction_sum_v4i32_v4i32:
 ; SSSE3-SLOW:       # %bb.0: