From 2655bd51d6a350b1aa71566fa9cbaad64990336a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 11 Aug 2020 18:06:00 +0100
Subject: [PATCH] [X86][SSE] combineShuffleWithHorizOp - canonicalize
 SHUFFLE(HOP(X,Y),HOP(Y,X)) -> SHUFFLE(HOP(X,Y))

Attempt to canonicalize binary shuffles of HOPs with commuted operands to an unary shuffle.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 30 ++++++++++++++++++++++++-
 llvm/test/CodeGen/X86/haddsub-shuf.ll   |  8 -------
 llvm/test/CodeGen/X86/haddsub-undef.ll  |  2 --
 3 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 210d04be7c36e3..9ca4d3386960f4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35886,12 +35886,40 @@ static SDValue combineShuffleWithHorizOp(SDValue N, MVT VT, const SDLoc &DL,
   if (!isHoriz && !isPack)
     return SDValue();
 
-  // Canonicalize unary horizontal ops to only refer to lower halves.
   if (TargetMask.size() == VT0.getVectorNumElements()) {
     int NumElts = VT0.getVectorNumElements();
     int NumLanes = VT0.getSizeInBits() / 128;
     int NumEltsPerLane = NumElts / NumLanes;
     int NumHalfEltsPerLane = NumEltsPerLane / 2;
+
+    // Canonicalize binary shuffles of horizontal ops that use the
+    // same sources to an unary shuffle.
+    // TODO: Try to perform this fold even if the shuffle remains.
+    if (BC0 != BC1) {
+      auto ContainsOps = [](SDValue HOp, SDValue Op) {
+        return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
+      };
+      // Commute if all BC0's ops are contained in BC1.
+      if (ContainsOps(BC1, BC0.getOperand(0)) &&
+          ContainsOps(BC1, BC0.getOperand(1))) {
+        ShuffleVectorSDNode::commuteMask(TargetMask);
+        std::swap(BC0, BC1);
+      }
+      // If BC1 can be represented by BC0, then convert to unary shuffle.
+      if (ContainsOps(BC0, BC1.getOperand(0)) &&
+          ContainsOps(BC0, BC1.getOperand(1))) {
+        for (int &M : TargetMask) {
+          if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
+            continue;
+          int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
+          M -= NumElts + (SubLane * NumHalfEltsPerLane);
+          if (BC1.getOperand(SubLane) != BC0.getOperand(0))
+            M += NumHalfEltsPerLane;
+        }
+      }
+    }
+
+    // Canonicalize unary horizontal ops to only refer to lower halves.
     for (int i = 0; i != NumElts; ++i) {
       int &M = TargetMask[i];
       if (isUndefOrZero(M))
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
index d7bacfe04be88a..4a2f8bd0c8b27a 100644
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -910,8 +910,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
 ; AVX1_FAST:       # %bb.0:
 ; AVX1_FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX1_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX1_FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX1_FAST-NEXT:    retq
 ;
@@ -929,8 +927,6 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX2_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2_FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX2_FAST-NEXT:    retq
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32>
@@ -972,8 +968,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 ; AVX1_FAST-LABEL: PR34724_2:
 ; AVX1_FAST:       # %bb.0:
 ; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX1_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX1_FAST-NEXT:    retq
 ;
 ; AVX2_SLOW-LABEL: PR34724_2:
@@ -987,8 +981,6 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
 ; AVX2_FAST-LABEL: PR34724_2:
 ; AVX2_FAST:       # %bb.0:
 ; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX2_FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX2_FAST-NEXT:    retq
   %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
   %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index f950d0b6a723dd..f8379e629c0245 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -986,8 +986,6 @@ define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) {
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm1, %xmm1
-; AVX-FAST-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX-FAST-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3]
 ; AVX-FAST-NEXT:    retq
   %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32>
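
Illustrative sketch (not part of the patch): the new canonicalization targets shuffles whose two inputs are horizontal ops built from the same pair of sources in swapped order, i.e. SHUFFLE(HOP(X,Y),HOP(Y,X)). A minimal IR reproducer in the spirit of the PR34724 tests above might look like the following; the function name, shuffle mask and llc invocation are hypothetical, and it assumes the blend of the two FHADD nodes reaches combineShuffleWithHorizOp as a target shuffle.

; Hypothetical example; compile with e.g. llc -mtriple=x86_64-- -mattr=+sse3.
; %h1 = hadd(b,a) reads the same sources as %h0 = hadd(a,b), so after the
; commute/remap step above the blend can be expressed as a shuffle of %h0
; alone, and the second vhaddps should become dead (mirroring the removed
; vhaddps/vblendps check lines in the updated tests).
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)

define <4 x float> @hadd_commuted_ops(<4 x float> %a, <4 x float> %b) {
  %h0 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a, <4 x float> %b)
  %h1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %b, <4 x float> %a)
  %r = shufflevector <4 x float> %h0, <4 x float> %h1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %r
}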