Skip to content

Commit

Permalink
[NVPTX] Implement min/max in tablegen, rather than with custom DAGCom…
Browse files Browse the repository at this point in the history
…ine logic.

Summary:
This change also lets us use max.{s,u}16.  There's a vague warning in a
test about this maybe being less efficient, but I could not come up with
a case where the resulting SASS (sm_35 or sm_60) was different with or
without max.{s,u}16.  It's true that nvcc seems to emit only
max.{s,u}32, but even ptxas 7.0 seems to have no problem generating
efficient SASS from max.{s,u}16 (the casts up to i32 and back down to
i16 seem to be implicit and nops, happening via register aliasing).

In the absence of evidence, better to have fewer special cases, emit
more straightforward code, etc.  In particular, if a new GPU has 16-bit
min/max instructions, we want to be able to use them.

Reviewers: tra

Subscribers: jholewinski, llvm-commits

Differential Revision: https://reviews.llvm.org/D28732

llvm-svn: 292304
  • Loading branch information
Justin Lebar committed Jan 18, 2017
1 parent 7dc3d6c commit cc938fc
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 77 deletions.
80 changes: 10 additions & 70 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Expand Up @@ -290,15 +290,19 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// Custom handling for i8 intrinsics
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

setOperationAction(ISD::CTLZ, MVT::i16, Legal);
setOperationAction(ISD::CTLZ, MVT::i32, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);
for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
setOperationAction(ISD::SMIN, Ty, Legal);
setOperationAction(ISD::SMAX, Ty, Legal);
setOperationAction(ISD::UMIN, Ty, Legal);
setOperationAction(ISD::UMAX, Ty, Legal);

setOperationAction(ISD::CTPOP, Ty, Legal);
setOperationAction(ISD::CTLZ, Ty, Legal);
}

setOperationAction(ISD::CTTZ, MVT::i16, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i16, Legal);
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);

// PTX does not directly support SELP of i1, so promote to i32 first
setOperationAction(ISD::SELECT, MVT::i1, Custom);
Expand All @@ -313,7 +317,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);

Expand Down Expand Up @@ -4159,67 +4162,6 @@ static SDValue PerformANDCombine(SDNode *N,
return SDValue();
}

// Recognizes a select whose condition is an integer comparison of the very
// two values being selected between, and rewrites it as the matching NVVM
// min/max intrinsic call. Only i32 and i64 results are handled; for any
// other shape an empty SDValue is returned.
static SDValue PerformSELECTCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  const SDValue SetCC = N->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return SDValue();

  const SDValue CmpA = SetCC.getOperand(0);
  const SDValue CmpB = SetCC.getOperand(1);
  const SDValue IfTrue = N->getOperand(1);
  const SDValue IfFalse = N->getOperand(2);

  // The selected values must be exactly the compared values, in either order.
  const bool SameOrder = (CmpA == IfTrue) && (CmpB == IfFalse);
  const bool Swapped = (CmpA == IfFalse) && (CmpB == IfTrue);
  if (!SameOrder && !Swapped)
    return SDValue();

  const EVT ResultTy = N->getValueType(0);
  const bool Is32 = (ResultTy == MVT::i32);
  if (!Is32 && ResultTy != MVT::i64)
    return SDValue();

  // Work out which compared operand is the larger one whenever the
  // condition evaluates to true.
  const ISD::CondCode Pred = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  bool FirstIsLarger;
  switch (Pred) {
  case ISD::SETGT:
  case ISD::SETGE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    FirstIsLarger = true;
    break;

  case ISD::SETLT:
  case ISD::SETLE:
  case ISD::SETULT:
  case ISD::SETULE:
    FirstIsLarger = false;
    break;

  default:
    return SDValue();
  }

  // Selecting the larger operand on a true condition is a max; otherwise
  // the select computes a min.
  const bool WantMax = ((FirstIsLarger ? CmpA : CmpB) == IfTrue);
  const bool Signed = ISD::isSignedIntSetCC(Pred);

  unsigned IntrinsicId;
  if (WantMax) {
    if (Signed)
      IntrinsicId = Is32 ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_max_ll;
    else
      IntrinsicId = Is32 ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_max_ull;
  } else {
    if (Signed)
      IntrinsicId = Is32 ? Intrinsic::nvvm_min_i : Intrinsic::nvvm_min_ll;
    else
      IntrinsicId = Is32 ? Intrinsic::nvvm_min_ui : Intrinsic::nvvm_min_ull;
  }

  SDLoc DL(N);
  return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResultTy,
                         DCI.DAG.getConstant(IntrinsicId, DL, ResultTy), CmpA,
                         CmpB);
}

static SDValue PerformREMCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
CodeGenOpt::Level OptLevel) {
Expand Down Expand Up @@ -4429,8 +4371,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
return PerformSHLCombine(N, DCI, OptLevel);
case ISD::AND:
return PerformANDCombine(N, DCI);
case ISD::SELECT:
return PerformSELECTCombine(N, DCI);
case ISD::UREM:
case ISD::SREM:
return PerformREMCombine(N, DCI, OptLevel);
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Expand Up @@ -529,6 +529,12 @@ defm ABS_16 : ABS<Int16Regs, 15, ".s16">;
defm ABS_32 : ABS<Int32Regs, 31, ".s32">;
defm ABS_64 : ABS<Int64Regs, 63, ".s64">;

// Integer min/max.
//
// Each defm instantiates the I3 multiclass (not shown in this excerpt) with
// a PTX opcode prefix and the generic SelectionDAG node it selects from:
// smax/umax for signed/unsigned max, smin/umin for signed/unsigned min.
// NOTE(review): I3 presumably appends the operand bit width to the prefix
// (e.g. "max.s" -> "max.s16" / "max.s32" / "max.s64") -- confirm against the
// I3 definition.
defm SMAX : I3<"max.s", smax>;
defm UMAX : I3<"max.u", umax>;
defm SMIN : I3<"min.s", smin>;
defm UMIN : I3<"min.u", umin>;

//
// Wide multiplication
//
Expand Down
134 changes: 127 additions & 7 deletions llvm/test/CodeGen/NVPTX/combine-min-max.ll
Expand Up @@ -21,20 +21,140 @@ define i64 @ba_ne_i64(i64 %a, i64 %b) {
ret i64 %sel
}

; PTX does have e.g. max.s16, but at least as of Kepler (sm_3x) that
; gets compiled to SASS that converts the 16 bit parameters to 32 bit
; before using a 32 bit instruction. That is probably not a win and
; NVCC 7.5 does not emit 16 bit min/max either, presumably for that
; reason.
; *************************************
; * All variations with i16

; *** ab, unsigned, i16
define i16 @ab_ugt_i16(i16 %a, i16 %b) {
; Unsigned max: yields %a when %a >u %b, otherwise %b.
; LABEL: @ab_ugt_i16
; CHECK-NOT: min
; CHECK-NOT: max
; CHECK: max.u16
%cmp = icmp ugt i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

define i16 @ab_uge_i16(i16 %a, i16 %b) {
; Unsigned max: yields %a when %a >=u %b, otherwise %b.
; LABEL: @ab_uge_i16
; CHECK: max.u16
%cmp = icmp uge i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

define i16 @ab_ult_i16(i16 %a, i16 %b) {
; Unsigned min: yields %a when %a <u %b, otherwise %b.
; LABEL: @ab_ult_i16
; CHECK: min.u16
%cmp = icmp ult i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

define i16 @ab_ule_i16(i16 %a, i16 %b) {
; Unsigned min: yields %a when %a <=u %b, otherwise %b.
; LABEL: @ab_ule_i16
; CHECK: min.u16
%cmp = icmp ule i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

; *** ab, signed, i16
define i16 @ab_sgt_i16(i16 %a, i16 %b) {
; Signed max: yields %a when %a >s %b, otherwise %b.
; NOTE(review): the LABEL line below appears to be a plain comment rather
; than a FileCheck directive; it previously named @ab_ugt_i16 by mistake.
; LABEL: @ab_sgt_i16
; CHECK: max.s16
%cmp = icmp sgt i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

define i16 @ab_sge_i16(i16 %a, i16 %b) {
; Signed max: yields %a when %a >=s %b, otherwise %b.
; LABEL: @ab_sge_i16
; CHECK: max.s16
%cmp = icmp sge i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

define i16 @ab_slt_i16(i16 %a, i16 %b) {
; Signed min: yields %a when %a <s %b, otherwise %b.
; LABEL: @ab_slt_i16
; CHECK: min.s16
%cmp = icmp slt i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

define i16 @ab_sle_i16(i16 %a, i16 %b) {
; Signed min: yields %a when %a <=s %b, otherwise %b.
; LABEL: @ab_sle_i16
; CHECK: min.s16
%cmp = icmp sle i16 %a, %b
%sel = select i1 %cmp, i16 %a, i16 %b
ret i16 %sel
}

; *** ba, unsigned, i16
define i16 @ba_ugt_i16(i16 %a, i16 %b) {
; Unsigned min with swapped select arms: yields %b when %a >u %b, otherwise %a.
; LABEL: @ba_ugt_i16
; CHECK: min.u16
%cmp = icmp ugt i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

define i16 @ba_uge_i16(i16 %a, i16 %b) {
; Unsigned min with swapped select arms: yields %b when %a >=u %b, otherwise %a.
; LABEL: @ba_uge_i16
; CHECK: min.u16
%cmp = icmp uge i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

define i16 @ba_ult_i16(i16 %a, i16 %b) {
; Unsigned max with swapped select arms: yields %b when %a <u %b, otherwise %a.
; LABEL: @ba_ult_i16
; CHECK: max.u16
%cmp = icmp ult i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

define i16 @ba_ule_i16(i16 %a, i16 %b) {
; Unsigned max with swapped select arms: yields %b when %a <=u %b, otherwise %a.
; LABEL: @ba_ule_i16
; CHECK: max.u16
%cmp = icmp ule i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

; *** ba, signed, i16
define i16 @ba_sgt_i16(i16 %a, i16 %b) {
; Signed min with swapped select arms: yields %b when %a >s %b, otherwise %a.
; NOTE(review): the LABEL line below appears to be a plain comment rather
; than a FileCheck directive; it was previously misspelled and named
; @ba_ugt_i16 by mistake.
; LABEL: @ba_sgt_i16
; CHECK: min.s16
%cmp = icmp sgt i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

define i16 @ba_sge_i16(i16 %a, i16 %b) {
; Signed min with swapped select arms: yields %b when %a >=s %b, otherwise %a.
; LABEL: @ba_sge_i16
; CHECK: min.s16
%cmp = icmp sge i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

define i16 @ba_slt_i16(i16 %a, i16 %b) {
; Signed max with swapped select arms: yields %b when %a <s %b, otherwise %a.
; LABEL: @ba_slt_i16
; CHECK: max.s16
%cmp = icmp slt i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

define i16 @ba_sle_i16(i16 %a, i16 %b) {
; Signed max with swapped select arms: yields %b when %a <=s %b, otherwise %a.
; LABEL: @ba_sle_i16
; CHECK: max.s16
%cmp = icmp sle i16 %a, %b
%sel = select i1 %cmp, i16 %b, i16 %a
ret i16 %sel
}

; *************************************
; * All variations with i32
Expand Down

0 comments on commit cc938fc

Please sign in to comment.