[X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))

An initial backend patch towards fixing the various poor HADD combines (PR34724, PR41813, PR45747 etc.).

This extends isHorizontalBinOp to check whether we have per-element horizontal ops (odd+even element pairs) that are not in the expected serial order; in that case we build a "post-shuffle mask" that we can apply to the HOP result, assuming we have fast-hops/optsize etc.
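
As a concrete illustration (a minimal standalone model, not the patch's code; matchHopWithPostShuffle and its mask encoding are hypothetical stand-ins for the isHorizontalBinOp logic), consider one 128-bit v4f32 chunk where every lane consumes an odd+even pair, but the pairs appear out of serial order:

#include <algorithm>
#include <cstdio>
#include <vector>

// Model of one 128-bit v4f32 chunk with two sources A and B: shuffle mask
// indices 0-3 pick from A, 4-7 from B, and HADDPS(A, B) produces
// {a0+a1, a2+a3, b0+b1, b2+b3}.
static bool matchHopWithPostShuffle(const std::vector<int> &LMask,
                                    const std::vector<int> &RMask,
                                    std::vector<int> &PostMask) {
  const int NumElts = 4;
  PostMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int LIdx = LMask[i], RIdx = RMask[i];
    // Each result lane must consume an {even, odd} element pair, in either
    // order since FADD commutes; otherwise this is not a horizontal op.
    if (!((LIdx % 2 == 0 && RIdx == LIdx + 1) ||
          (RIdx % 2 == 0 && LIdx == RIdx + 1)))
      return false;
    // Pair {2k, 2k+1} of A lands in lane k of HADDPS(A, B); pairs taken
    // from B land in the high two lanes.
    int Base = std::min(LIdx, RIdx);
    PostMask[i] = (Base % NumElts) / 2 + (Base >= NumElts ? NumElts / 2 : 0);
  }
  return true;
}

int main() {
  // fadd(shuffle(A,B,<2,0,6,4>), shuffle(A,B,<3,1,7,5>)) computes
  // {a2+a3, a0+a1, b2+b3, b0+b1}: all horizontal pairs, but out of order.
  std::vector<int> PostMask;
  if (matchHopWithPostShuffle({2, 0, 6, 4}, {3, 1, 7, 5}, PostMask))
    std::printf("post-shuffle mask: %d %d %d %d\n", PostMask[0], PostMask[1],
                PostMask[2], PostMask[3]); // prints: 1 0 3 2
}

In this sketch the whole expression becomes SHUFFLE(HADDPS(A,B), <1,0,3,2>), which is the shape of fold the patch produces.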

The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as suggested on PR41813, accepting more post-shuffle masks even on slow-hop targets if we can fold them into another shuffle.

Differential Revision: https://reviews.llvm.org/D83789

(cherry picked from commit 1821117)
RKSimon authored and zmodem committed Jul 28, 2020
1 parent 592454c commit d687594
Showing 7 changed files with 321 additions and 409 deletions.
74 changes: 55 additions & 19 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44364,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
/// A horizontal-op B, for some already available A and B, and if so then LHS is
/// set to A, RHS to B, and the routine returns 'true'.
static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              bool IsCommutative) {
+                              const X86Subtarget &Subtarget, bool IsCommutative,
+                              SmallVectorImpl<int> &PostShuffleMask) {
// If either operand is undef, bail out. The binop should be simplified.
if (LHS.isUndef() || RHS.isUndef())
return false;
@@ -44458,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
RMask.push_back(i);
}

+  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+  if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+      (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+    return false;

// If A and B occur in reverse order in RHS, then canonicalize by commuting
// RHS operands and shuffle mask.
if (A != C) {
@@ -44468,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
if (!(A == C && B == D))
return false;

+  PostShuffleMask.clear();
+  PostShuffleMask.append(NumElts, SM_SentinelUndef);

// LHS and RHS are now:
// LHS = shuffle A, B, LMask
// RHS = shuffle A, B, RMask
@@ -44476,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
// so we just repeat the inner loop if this is a 256-bit op.
unsigned Num128BitChunks = VT.getSizeInBits() / 128;
unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
assert((NumEltsPer128BitChunk % 2 == 0) &&
"Vector type should have an even number of elements in each lane");
for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44487,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
(!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
continue;

+      // Check that successive odd/even elements are being operated on. If not,
+      // this is not a horizontal operation.
+      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+        return false;

+      // Compute the post-shuffle mask index based on where the element
+      // is stored in the HOP result, and where it needs to be moved to.
+      int Base = LIdx & ~1u;
+      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));

// The low half of the 128-bit result must choose from A.
// The high half of the 128-bit result must choose from B,
// unless B is undef. In that case, we are always choosing from A.
-      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
-
-      // Check that successive elements are being operated on. If not, this is
-      // not a horizontal operation.
-      int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j;
-      if (!(LIdx == Index && RIdx == Index + 1) &&
-          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
-        return false;
+      if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
+        Index += NumEltsPer64BitChunk;
+      PostShuffleMask[i + j] = Index;
}
}

LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.

-  if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
+  bool IsIdentityPostShuffle =
+      isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
+  if (IsIdentityPostShuffle)
+    PostShuffleMask.clear();
+
+  // Assume a SingleSource HOP if we only shuffle one input and don't need to
+  // shuffle the result.
+  if (!shouldUseHorizontalOp(LHS == RHS &&
+                                 (NumShuffles < 2 || !IsIdentityPostShuffle),
+                             DAG, Subtarget))
return false;

LHS = DAG.getBitcast(VT, LHS);
@@ -44524,10 +44549,16 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

// Try to synthesize horizontal add/sub from adds/subs of shuffles.
+  SmallVector<int, 8> PostShuffleMask;
if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd))
-    return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+      isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) {
+    SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+    if (!PostShuffleMask.empty())
+      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+                                        DAG.getUNDEF(VT), PostShuffleMask);
+    return HorizBinOp;
+  }

// NOTE: isHorizontalBinOp may have changed LHS/RHS variables.

@@ -47620,17 +47651,22 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG,
bool IsAdd = N->getOpcode() == ISD::ADD;
assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode");

+  SmallVector<int, 8> PostShuffleMask;
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
VT == MVT::v8i32) &&
Subtarget.hasSSSE3() &&
-      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd)) {
+      isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) {
auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
-      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB,
-                         DL, Ops[0].getValueType(), Ops);
+      return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL,
+                         Ops[0].getValueType(), Ops);
};
-    return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
-                            HOpBuilder);
+    SDValue HorizBinOp =
+        SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder);
+    if (!PostShuffleMask.empty())
+      HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
+                                        DAG.getUNDEF(VT), PostShuffleMask);
+    return HorizBinOp;
}

return SDValue();
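
(Illustrative aside, not LLVM's helper: the IsIdentityPostShuffle test above relies on isSequentialOrUndefInRange, which for this use boils down to a check like the following standalone sketch, where -1 plays the role of SM_SentinelUndef.)

#include <vector>

// A post-shuffle mask can be dropped when every defined entry already sits
// at its own position, i.e. the shuffle would be a no-op.
static bool isIdentityOrUndefMask(const std::vector<int> &Mask) {
  for (int i = 0, e = (int)Mask.size(); i != e; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  return true;
}
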
54 changes: 39 additions & 15 deletions llvm/test/CodeGen/X86/haddsub-3.ll
@@ -17,22 +17,46 @@ define float @pr26491(<4 x float> %a0) {
; SSE2-NEXT: addss %xmm1, %xmm0
; SSE2-NEXT: retq
;
-; SSSE3-LABEL: pr26491:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: addps %xmm0, %xmm1
-; SSSE3-NEXT: movaps %xmm1, %xmm0
-; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSSE3-NEXT: addss %xmm1, %xmm0
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: pr26491:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1
+; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
;
-; AVX-LABEL: pr26491:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSSE3-FAST-LABEL: pr26491:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSSE3-FAST-NEXT: addss %xmm0, %xmm1
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: retq
+;
+; AVX1-SLOW-LABEL: pr26491:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: pr26491:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: pr26491:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%2 = fadd <4 x float> %1, %a0
%3 = extractelement <4 x float> %2, i32 2
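
The SSSE3-FAST and AVX1-FAST output above is consistent with the new matching once the lanes the test never extracts are treated as undef (an assumption about how demanded-elements simplification has already rewritten the shuffle before the combine runs). A hypothetical single-source variant of the earlier sketch, with matchSingleSourceHop as an illustrative name only, shows the resulting mask:

#include <algorithm>
#include <vector>

// -1 marks an undef lane, as SM_SentinelUndef does in LLVM. For pr26491 the
// assumed inputs are LMask = {1,-1,3,-1} (the movshdup-style shuffle) and
// RMask = {0,1,2,3} (%a0 used directly, i.e. an identity mask).
static bool matchSingleSourceHop(const std::vector<int> &LMask,
                                 const std::vector<int> &RMask,
                                 std::vector<int> &PostMask) {
  const int NumElts = 4;
  PostMask.assign(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int LIdx = LMask[i], RIdx = RMask[i];
    if (LIdx < 0 || RIdx < 0)
      continue; // undemanded lane: leave the post-shuffle entry undef
    if (!((LIdx % 2 == 0 && RIdx == LIdx + 1) ||
          (RIdx % 2 == 0 && LIdx == RIdx + 1)))
      return false;
    // haddps %xmm0,%xmm0 places the sum of pair {2k,2k+1} in lane k (and
    // again in lane k+2), so lane k is a valid source for the result.
    PostMask[i] = std::min(LIdx, RIdx) / 2;
  }
  return true; // pr26491 yields PostMask = {0, u, 1, u}
}

int main() {
  std::vector<int> PostMask;
  return matchSingleSourceHop({1, -1, 3, -1}, {0, 1, 2, 3}, PostMask) ? 0 : 1;
}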
