diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0eaba12330e74..6eaac2c93fb1a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35301,6 +35301,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
+  // etc. can be simplified.
+  if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
+    SmallVector<int> ScaledMask, IdentityMask;
+    unsigned NumElts = VT1.getVectorNumElements();
+    if (BaseMask.size() <= NumElts &&
+        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
+      for (unsigned i = 0; i != NumElts; ++i)
+        IdentityMask.push_back(i);
+      if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
+        return CanonicalizeShuffleInput(RootVT, V1);
+    }
+  }
+
   // Handle 128/256-bit lane shuffles of 512-bit vectors.
   if (RootVT.is512BitVector() &&
       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index f9ed90e348729..6f3c7828c4e67 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -26,13 +26,12 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    haddps %xmm2, %xmm2
 ; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-SLOW-NEXT:    movddup {{.*#+}} xmm2 = xmm2[0,0]
-; SSSE3-SLOW-NEXT:    addps %xmm1, %xmm2
+; SSSE3-SLOW-NEXT:    addps %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT:    haddps %xmm3, %xmm3
-; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; SSSE3-SLOW-NEXT:    addps %xmm3, %xmm2
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
@@ -109,13 +108,12 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm0
 ; SSSE3-SLOW-NEXT:    phaddd %xmm2, %xmm2
 ; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; SSSE3-SLOW-NEXT:    paddd %xmm1, %xmm2
+; SSSE3-SLOW-NEXT:    paddd %xmm2, %xmm1
 ; SSSE3-SLOW-NEXT:    phaddd %xmm3, %xmm3
-; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
-; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm1
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-SLOW-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
+; SSSE3-SLOW-NEXT:    paddd %xmm3, %xmm2
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
+; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
 ; SSSE3-SLOW-NEXT:    retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v4i32_v4i32:
@@ -136,9 +134,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; AVX1-SLOW-NEXT:    retq
 ;
@@ -161,9 +158,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT:    vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-SLOW-NEXT:    vpbroadcastq %xmm1, %xmm2
-; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm2
+; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2-SLOW-NEXT:    retq
   %5 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
@@ -441,7 +437,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,3,1,1]
 ; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1],zero
-; AVX1-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX1-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -493,7 +489,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3]
 ; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
 ; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; AVX2-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
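
Note for reviewers: the new combine rescales the chain's base mask to the width of the (matching) input type with scaleShuffleElements, then asks isTargetShuffleEquivalent whether the scaled mask matches the identity once repeated operands (e.g. hadd(x,x)) are taken into account; if so, the whole chain folds to its first input. The standalone C++ sketch below is a simplified model, not LLVM code - hadd, shuffle, and V4 are made-up stand-ins - demonstrating the underlying fact on the exact pattern the SSSE3-SLOW tests exercise:

#include <array>
#include <cstdio>

using V4 = std::array<int, 4>;

// Simplified model of x86 PHADDD/HADDPS: the low half holds the pairwise
// sums of A, the high half the pairwise sums of B.
static V4 hadd(const V4 &A, const V4 &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

// Apply a 4-element shuffle mask to a single source vector.
static V4 shuffle(const V4 &Src, const std::array<int, 4> &Mask) {
  V4 R{};
  for (int i = 0; i != 4; ++i)
    R[i] = Src[Mask[i]];
  return R;
}

int main() {
  V4 X = {1, 2, 3, 4};
  // With repeated operands, hadd(x,x) = {s0, s1, s0, s1}: both halves
  // already hold the same values.
  V4 H = hadd(X, X);
  // movddup (and pshufd $0x44) splats the low 64 bits, i.e. the v4i32
  // mask {0,1,0,1}. On hadd(x,x) that mask is a hidden identity shuffle:
  // it returns its input unchanged, so the shuffle node can be dropped.
  V4 S = shuffle(H, {0, 1, 0, 1});
  std::printf("hadd(x,x): %d %d %d %d\n", H[0], H[1], H[2], H[3]);
  std::printf("shuffled:  %d %d %d %d\n", S[0], S[1], S[2], S[3]);
  return S == H ? 0 : 1;
}

Built with any C++11 compiler, this prints two identical vectors and returns 0, which mirrors why the movddup and pshufd {{.*}} [0,1,0,1] lines disappear from the SSSE3-SLOW checks above.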