Skip to content

Commit

Permalink
[X86][SSE] Use (V)PHMINPOSUW for vXi16 SMAX/SMIN/UMAX/UMIN horizontal…
Browse files Browse the repository at this point in the history
… reductions (PR32841)

(V)PHMINPOSUW determines the UMIN element in an v8i16 input, with suitable bit flipping it can also be used for SMAX/SMIN/UMAX cases as well.

This patch matches vXi16 SMAX/SMIN/UMAX/UMIN horizontal reductions and reduces the input down to a v8i16 vector before calling (V)PHMINPOSUW.

A later patch will use this for v16i8 reductions as well (PR32841).

Differential Revision: https://reviews.llvm.org/D39729

llvm-svn: 318917
  • Loading branch information
RKSimon committed Nov 23, 2017
1 parent a3251bf commit 90accbc
Show file tree
Hide file tree
Showing 9 changed files with 599 additions and 579 deletions.
65 changes: 65 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25073,6 +25073,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
Expand Down Expand Up @@ -30326,6 +30327,66 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}

// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW.
static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// Bail without SSE41.
if (!Subtarget.hasSSE41())
return SDValue();

EVT ExtractVT = Extract->getValueType(0);
if (ExtractVT != MVT::i16)
return SDValue();

// Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
unsigned BinOp;
SDValue Src = matchBinOpReduction(
Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
if (!Src)
return SDValue();

EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getScalarType();
if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
return SDValue();

SDLoc DL(Extract);
SDValue MinPos = Src;

// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
unsigned NumElts = SrcVT.getVectorNumElements();
unsigned NumSubElts = NumElts / 2;
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
unsigned SubSizeInBits = SrcVT.getSizeInBits();
SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(SrcVT == MVT::v8i16 && "Unexpected value type");

// PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
// to flip the value accordingly.
SDValue Mask;
if (BinOp == ISD::SMAX)
Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
else if (BinOp == ISD::SMIN)
Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
else if (BinOp == ISD::UMAX)
Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);

if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);

if (Mask)
MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
DAG.getIntPtrConstant(0, DL));
}

// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
Expand Down Expand Up @@ -30633,6 +30694,10 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;

// Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;

// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (SrcVT != MVT::v4i32)
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,9 @@ namespace llvm {
// Vector integer comparisons, the result is in a mask vector.
PCMPEQM, PCMPGTM,

// v8i16 Horizontal minimum and position.
PHMINPOS,

MULTISHIFT,

/// Vector comparison generating mask bits for fp and
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;

def X86phminpos: SDNode<"X86ISD::PHMINPOS",
SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;

def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisVec<2>, SDTCisInt<0>,
SDTCisInt<1>]>;
Expand Down
12 changes: 5 additions & 7 deletions llvm/lib/Target/X86/X86InstrSSE.td
Original file line number Diff line number Diff line change
Expand Up @@ -6188,33 +6188,31 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
Sched<[WriteFAddLd]>, XS;
}



// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
Intrinsic IntId128, PatFrag ld_frag,
SDNode OpNode, PatFrag ld_frag,
X86FoldableSchedWrite Sched> {
def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (IntId128 VR128:$src))]>,
[(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
Sched<[Sched]>;
def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(IntId128 (bitconvert (ld_frag addr:$src))))]>,
(v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
int_x86_sse41_phminposuw, loadv2i64,
X86phminpos, loadv2i64,
WriteVecIMul>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
int_x86_sse41_phminposuw, memopv2i64,
X86phminpos, memopv2i64,
WriteVecIMul>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/X86/X86IntrinsicsInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1679,6 +1679,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
Expand Down

0 comments on commit 90accbc

Please sign in to comment.