166 changes: 133 additions & 33 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasBFE())
setHasExtractBitsInsn(true);

setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);


// These are really only legal for ieee_mode functions. We should be avoiding
// them for functions that don't have ieee_mode enabled, so just say they are
// legal.
setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);


if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
Expand Down Expand Up @@ -474,8 +486,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP2 Actions.
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);

setOperationAction(ISD::FDIV, MVT::f16, Custom);

// F16 - VOP3 Actions.
Expand Down Expand Up @@ -558,6 +569,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// This isn't really legal, but this avoids the legalizer unrolling it (and
// allows matching fneg (fabs x) patterns)
setOperationAction(ISD::FABS, MVT::v2f16, Legal);

setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);

setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);

setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
}

if (Subtarget->hasVOP3PInsts()) {
Expand All @@ -575,8 +597,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);

setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);

setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
Expand All @@ -596,6 +620,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);

setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);

setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
Expand Down Expand Up @@ -634,6 +662,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::FMINNUM_IEEE);
setTargetDAGCombine(ISD::FMAXNUM_IEEE);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
Expand Down Expand Up @@ -3580,6 +3610,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FNEG:
case ISD::FCANONICALIZE:
return splitUnaryVectorOp(Op, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
Expand All @@ -3590,10 +3623,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMAX:
case ISD::UMIN:
case ISD::UMAX:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FADD:
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
Expand Down Expand Up @@ -4048,6 +4081,23 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
}

/// Custom lowering for FMINNUM / FMAXNUM nodes.
///
/// When the current function has the IEEE bit enabled the node is handed to
/// expandFMINNUM_FMAXNUM. Otherwise a v4f16 node is split with
/// splitBinaryVectorOp and a node of any other type is returned unchanged.
SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                               SelectionDAG &DAG) const {
  // FIXME: Assert during selection that this is only selected for
  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
  // mode functions, but this happens to be OK since it's only done in cases
  // where there is known no sNaN.
  if (Subtarget->enableIEEEBit(DAG.getMachineFunction()))
    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

  // Non-IEEE mode: only v4f16 needs further work here; every other type is
  // returned as-is.
  const bool NeedsSplit = Op.getValueType() == MVT::v4f16;
  return NeedsSplit ? splitBinaryVectorOp(Op, DAG) : Op;
}

SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
Expand Down Expand Up @@ -7521,37 +7571,32 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,

case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
case AMDGPUISD::FMIN3: {
// FIXME: Shouldn't treat the generic operations differently based on these.
bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
if (IsIEEEMode) {
// snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
denormalsEnabledForType(Op.getValueType()))
return true;

// Flushing may be required.
// In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
// targets need to check their input recursively.
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
}
// However, we aren't really required to flush the result from
// minnum/maxnum..

// snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
denormalsEnabledForType(Op.getValueType())) {
// Only quieting may be necessary.
return DAG.isKnownNeverSNaN(Op.getOperand(0)) &&
DAG.isKnownNeverSNaN(Op.getOperand(1));
denormalsEnabledForType(Op.getValueType()))
return true;

// Flushing may be required.
// In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
// targets need to check their input recursively.

// FIXME: Does this apply with clamp? It's implemented with max.
for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
return false;
}

// Flushing and quieting may be necessary
// With ieee_mode off, the nan is returned as-is, so if it is an sNaN it
// needs to be quieted.
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
return true;
}
case ISD::SELECT: {
return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
Expand All @@ -7578,6 +7623,21 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// Could be anything.
return false;

case ISD::BITCAST: {
// Hack round the mess we make when legalizing extract_vector_elt
SDValue Src = Op.getOperand(0);
if (Src.getValueType() == MVT::i16 &&
Src.getOpcode() == ISD::TRUNCATE) {
SDValue TruncSrc = Src.getOperand(0);
if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
}
}

return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntrinsicID
= cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
Expand All @@ -7603,7 +7663,6 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
}

// Constant fold canonicalize.

SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
// Flush denormals to 0 if not enabled.
Expand Down Expand Up @@ -7699,18 +7758,40 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
}
}

unsigned SrcOpc = N0.getOpcode();

// If it's free to do so, push canonicalizes further up the source, which may
// find a canonical source.
//
// TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
// sNaNs.
if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
if (CRHS && N0.hasOneUse()) {
SDLoc SL(N);
SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
N0.getOperand(0));
SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
DCI.AddToWorklist(Canon0.getNode());

return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
}
}

return isCanonicalized(DAG, N0) ? N0 : SDValue();
}

static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
return AMDGPUISD::UMAX3;
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
Expand Down Expand Up @@ -7877,6 +7958,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,

// fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
(Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
(VT == MVT::f32 || VT == MVT::f64 ||
Expand Down Expand Up @@ -7995,7 +8077,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::SMIN:
case ISD::SMAX:
case ISD::FMAXNUM:
case ISD::FMINNUM: {
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Expand Down Expand Up @@ -8595,13 +8679,15 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
case ISD::UMIN:
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMinMaxCombine(N, DCI);
break;
Expand Down Expand Up @@ -9320,3 +9406,17 @@ bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
return false;
}
}

/// Known-never-NaN query for target-specific nodes.
///
/// Only AMDGPUISD::CLAMP is handled here; every other opcode is forwarded to
/// the AMDGPUTargetLowering implementation with the same \p SNaN and \p Depth.
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                    const SelectionDAG &DAG,
                                                    bool SNaN,
                                                    unsigned Depth) const {
  if (Op.getOpcode() != AMDGPUISD::CLAMP)
    return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN,
                                                              Depth);

  // With DX10 clamp enabled the result is clamped to 0, so it is never a NaN.
  // Otherwise the answer is whatever is known about the clamp's source
  // operand, queried one level deeper.
  return Subtarget->enableDX10Clamp() ||
         DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
}
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {

/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
/// Custom lowering for ISD::FMINNUM and ISD::FMAXNUM.
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;

SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;
Expand Down Expand Up @@ -346,6 +347,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool denormalsEnabledForType(EVT VT) const;

/// Override of the known-never-NaN query for target-specific nodes.
/// AMDGPUISD::CLAMP is handled by this class; all other opcodes are forwarded
/// to AMDGPUTargetLowering. \p SNaN and \p Depth are passed through to the
/// underlying queries (Depth is incremented when looking through a CLAMP's
/// source operand).
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN = false,
unsigned Depth = 0) const override;
};

} // End namespace llvm
Expand Down
13 changes: 7 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1645,21 +1645,22 @@ def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
// This matches 16 permutations of
// max(min(x, y), min(max(x, y), z))
class FPMed3Pat<ValueType vt,
//SDPatternOperator max, SDPatternOperator min,
Instruction med3Inst> : GCNPat<
(fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class FP16Med3Pat<ValueType vt,
Instruction med3Inst> : GCNPat<
(fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
(VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
(vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
(med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)
>;
Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -393,8 +393,8 @@ defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>,
defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmul_u24>;
defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smin>;
defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, smax>;
defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_PAT_GEN<VOP_I32_I32_I32>, umin>;
Expand Down Expand Up @@ -556,8 +556,8 @@ defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,8 @@ let SchedRW = [WriteDoubleAdd] in {
def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum, 1>;
def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum, 1>;
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]

let SchedRW = [WriteQuarterRate32] in {
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_

def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;

def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
Expand Down
20 changes: 15 additions & 5 deletions llvm/test/CodeGen/AMDGPU/clamp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float a

; GCN-LABEL: {{^}}v_clamp_negzero_maybe_snan_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[A]]
; GCN: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0x80000000, [[QUIET]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
Expand All @@ -90,8 +91,17 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(float addrspace(1)* %o

; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[QUIET_A]]
; GCN: v_min_f32_e32 [[MED:v[0-9]+]], 1.0, [[QUIET_A]]
; GCN-NOT: [[MAX]]
; GCN-NOT: [[MED]]

; SI: buffer_store_dword [[MED]]
; SI: buffer_store_dword [[MAX]]

; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]]
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]]
define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
Expand Down Expand Up @@ -406,8 +416,8 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out,

; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
; GCN: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[A]]
; GCN: v_med3_f32 {{v[0-9]+}}, [[QUIET_A]], 0, 1.0
define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
Expand Down
77 changes: 47 additions & 30 deletions llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
Original file line number Diff line number Diff line change
Expand Up @@ -455,14 +455,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GFX9-NOT: v_max
; GFX9-NOT: v_mul

; VI-DENORM-NOT: v_max_f32
; VI-DENORM-NOT: v_mul_f32
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]

; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul

; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
Expand All @@ -476,15 +475,13 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}

; GFX9-NOT: v_max
; GFX9-NOT: v_mul


; VI-DENORM-NOT: v_max
; VI-DENORM-NOT: v_mul
; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GCN-DENORM-NOT: v_max
; GCN-DENORM-NOT: v_mul

; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN-DENORM-NOT: v_max
; GCN-DENORM-NOT: v_mul

; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
Expand Down Expand Up @@ -530,13 +527,19 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace
}

; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
; GFX9: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]

; VI-FLUSH: v_min_f32_e32 [[V0:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]

; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, v{{[0-9]+}}

; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]

; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]

; GCN-NOT: v_mul
; GCN-NOT: v_max
Expand All @@ -552,11 +555,14 @@ define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspa
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
; VI-FLUSH: v_max_f32_e32 [[V0:v[0-9]+]], 0, v{{[0-9]+}}
; VI-FLUSH: v_mul_f32_e32 [[RESULT:v[0-9]+]], 1.0, [[V0]]
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]

; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]

; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, v{{[0-9]+}}
; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]

; GCN-NOT: v_mul
; GCN-NOT: v_max
Expand Down Expand Up @@ -707,16 +713,21 @@ define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspa

; Need to quiet the nan with a separate instruction since it will be
; passed through the minnum.
; FIXME: canonicalize doesn't work correctly without ieee_mode

; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX9-DENORM-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: ; return to shader

; VI: v_min_f32_e32 v0, v0, v1
; VI-FLUSH: v_mul_f32_e32 v0, 1.0, v0
; VI-DENORM: v_max_f32_e32 v0, v0, v0
; VI-FLUSH: v_min_f32_e32 v0, v0, v1
; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-NEXT: ; return

; VI-DENORM-NOT: v0
; VI-DENORM: v_min_f32_e32 v0, v0, v1
; VI-DENORM-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
Expand All @@ -727,8 +738,14 @@ define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64

; VI: v_min_f32_e32 v0, v0, v1
; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
; VI-FLUSH: v_min_f32_e32 v0, v0, v1

; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
; VI-DENORM: v_min_f32_e32 v0, v0, v1

; VI-NEXT: s_setpc_b64
define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
Expand Down
11 changes: 7 additions & 4 deletions llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
declare double @llvm.maxnum.f64(double, double) nounwind readnone

; SI-LABEL: {{^}}test_fmax3_f64:
; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
; SI: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; SI: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
; SI: v_max_f64 [[QUIET_A:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGA]]
; SI: v_max_f64 [[QUIET_B:v\[[0-9]+:[0-9]+\]]], [[REGB]], [[REGB]]
; SI: v_max_f64 [[MAX0:v\[[0-9]+:[0-9]+\]]], [[QUIET_A]], [[QUIET_B]]
; SI: v_max_f64 [[QUIET_C:v\[[0-9]+:[0-9]+\]]], [[REGC]], [[REGC]]
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[MAX0]], [[QUIET_C]]
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
Expand Down
44 changes: 27 additions & 17 deletions llvm/test/CodeGen/AMDGPU/fmax3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,11 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float
; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]

; VI: v_max_f16_e32
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]]

; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
; GCN: buffer_store_short [[RESULT]],
Expand All @@ -75,8 +78,11 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half ad
; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]]
; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]

; VI: v_max_f16_e32
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]]
; VI-DAG: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]]
; VI: v_max_f16_e32 [[MAX0:v[0-9]+]], [[QUIET_A]], [[QUIET_B]]
; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]]
; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]]

; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
; GCN: buffer_store_short [[RESULT]],
Expand All @@ -100,22 +106,25 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half ad
; SI-NEXT: v_max3_f32
; SI-NEXT: v_max3_f32

; VI: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_f16_e32 v0, v0, v1
; VI: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_max_f16_e32 v0, v2, v0
; VI: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI: v_max_f16_e32 v0, v0, v3
; VI: v_or_b32_e32 v0, v0, v1

; GFX9: v_pk_max_f16
; VI: s_waitcnt
; VI-NEXT: v_max_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_max_f16_e32 v0, v2, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16
; GFX9-NEXT: v_pk_max_f16
; GFX9-NEXT: v_pk_max_f16
define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
entry:
%max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
%res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
%max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
%max1 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
%res = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
ret <2 x half> %res
}

Expand All @@ -126,3 +135,4 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind "no-nans-fp-math"="true" }
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NNAN-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
Expand Down Expand Up @@ -178,7 +178,7 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; VI-NNAN-NEXT: v_max_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
Expand Down Expand Up @@ -283,8 +283,8 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_max_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
Expand Down Expand Up @@ -437,10 +437,10 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; VI-NNAN-NEXT: v_max_f16_e32 v1, v1, v5
; VI-NNAN-NEXT: v_max_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_max_f16_e32 v0, v0, v4
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10
; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9
; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
Expand Down
97 changes: 72 additions & 25 deletions llvm/test/CodeGen/AMDGPU/fmax_legacy.ll
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=FUNC %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s

; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s

; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

declare i32 @llvm.r600.read.tidig.x() #1

; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]

; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; EG: MAX
Expand All @@ -26,12 +35,16 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmax_legacy_uge_f32_nnan_src:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-DAG: v_add_f32_e32 [[ADD_A:v[0-9]+]], 1.0, [[A]]
; GCN-DAG: v_add_f32_e32 [[ADD_B:v[0-9]+]], 2.0, [[B]]

; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]

; VI-SAFE: v_cmp_nlt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[ADD_B]], [[ADD_A]]

; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]

; EG: MAX
Expand All @@ -52,9 +65,14 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f32_nnan_src(float addrspace(1)*
}

; FUNC-LABEL: {{^}}test_fmax_legacy_oge_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; VI-SAFE: v_cmp_ge_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
Expand All @@ -72,9 +90,15 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmax_legacy_ugt_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]

; VI-SAFE: v_cmp_nle_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]


; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
Expand All @@ -92,9 +116,14 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
Expand All @@ -112,9 +141,15 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v1f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; VI-SAFE: v_cmp_gt_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]


; GCN-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; EG: MAX
define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
Expand All @@ -132,12 +167,24 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)*
}

; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_v3f32:
; GCN-SAFE: v_max_legacy_f32_e32
; GCN-SAFE: v_max_legacy_f32_e32
; GCN-SAFE: v_max_legacy_f32_e32
; SI-SAFE: v_max_legacy_f32_e32
; SI-SAFE: v_max_legacy_f32_e32
; SI-SAFE: v_max_legacy_f32_e32

; VI-SAFE: v_cmp_gt_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE: v_cmp_gt_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE: v_cmp_gt_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE-NOT: v_cmp
; VI-SAFE-NOT: v_cndmask

; GCN-NONAN: v_max_f32_e32
; GCN-NONAN: v_max_f32_e32
; GCN-NONAN: v_max_f32_e32

; GCN-NOT: v_max
define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
Expand All @@ -153,8 +200,8 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)*
}

; FUNC-LABEL: {{^}}test_fmax_legacy_ogt_f32_multi_use:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: v_max_
; GCN: v_cmp_gt_f32
; GCN-NEXT: v_cndmask_b32
Expand Down
56 changes: 32 additions & 24 deletions llvm/test/CodeGen/AMDGPU/fmaxnum.ll
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; GCN-LABEL: {{^}}test_fmax_f32:
; GCN: v_max_f32_e32
define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%val = call float @llvm.maxnum.f32(float %a, float %b)
; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on:
; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
; Kernel that takes the maxnum of its two float arguments and stores the
; result to %out. The checks preceding this function expect each operand to
; go through a v_mul_f32 by 1.0 before the v_max_f32, and the max result to
; be stored without further modification.
; NOTE(review): the multiplies are presumably there to quiet signaling NaNs
; (the captures are named QUIET0/QUIET1) - confirm against the lowering.
define amdgpu_kernel void @test_fmax_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
%val = call float @llvm.maxnum.f32(float %a, float %b) #1
store float %val, float addrspace(1)* %out, align 4
ret void
}

; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_off:
; GCN: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: ; return
; amdgpu_ps function returning the maxnum of its two float arguments.
; The checks preceding this function expect a single v_max_f32 on v0, v1
; that is immediately followed by the return.
; NOTE(review): per the test name this calling convention is the
; "IEEE mode off" case - confirm against the target's mode defaults.
define amdgpu_ps float @test_fmax_f32_ieee_mode_off(float %a, float %b) #0 {
%val = call float @llvm.maxnum.f32(float %a, float %b) #1
ret float %val
}

; GCN-LABEL: {{^}}test_fmax_v2f32:
; GCN: v_max_f32_e32
; GCN: v_max_f32_e32
Expand Down Expand Up @@ -158,38 +170,34 @@ define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fmax_var_immediate_f32:
; GCN-LABEL: {{^}}fmax_var_immediate_f32_no_ieee:
; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 {
%val = call float @llvm.maxnum.f32(float %a, float 2.0)
store float %val, float addrspace(1)* %out, align 4
ret void
; amdgpu_ps function: maxnum of an inreg float argument (first operand) and
; the constant 2.0 (second operand); the result is returned directly.
define amdgpu_ps float @fmax_var_immediate_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
ret float %val
}

; GCN-LABEL: {{^}}fmax_immediate_var_f32:
; GCN-LABEL: {{^}}fmax_immediate_var_f32_no_ieee:
; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 {
%val = call float @llvm.maxnum.f32(float 2.0, float %a)
store float %val, float addrspace(1)* %out, align 4
ret void
; amdgpu_ps function: maxnum with the constant 2.0 as the first operand and
; an inreg float argument as the second; the result is returned directly.
; Same operands as the var/immediate variant, in the opposite order.
define amdgpu_ps float @fmax_immediate_var_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
ret float %val
}

; GCN-LABEL: {{^}}fmax_var_literal_f32:
; GCN-LABEL: {{^}}fmax_var_literal_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 {
%val = call float @llvm.maxnum.f32(float %a, float 99.0)
store float %val, float addrspace(1)* %out, align 4
ret void
; amdgpu_ps function: maxnum of an inreg float argument (first operand) and
; the constant 99.0 (second operand); the result is returned directly.
; The checks preceding this function expect the constant to be moved into a
; VGPR as 0x42c60000 before the v_max_f32.
define amdgpu_ps float @fmax_var_literal_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
ret float %val
}

; GCN-LABEL: {{^}}fmax_literal_var_f32:
; GCN-LABEL: {{^}}fmax_literal_var_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 {
%val = call float @llvm.maxnum.f32(float 99.0, float %a)
store float %val, float addrspace(1)* %out, align 4
ret void
; amdgpu_ps function: maxnum with the constant 99.0 as the first operand and
; an inreg float argument as the second; the result is returned directly.
; The checks preceding this function expect the constant to be moved into a
; VGPR as 0x42c60000 before the v_max_f32.
define amdgpu_ps float @fmax_literal_var_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
ret float %val
}

; GCN-LABEL: {{^}}test_func_fmax_v3f32:
Expand Down
35 changes: 20 additions & 15 deletions llvm/test/CodeGen/AMDGPU/fmin3.ll
Original file line number Diff line number Diff line change
Expand Up @@ -95,22 +95,26 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half ad
; SI-NEXT: v_min3_f32
; SI-NEXT: v_min3_f32

; VI: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_min_f16_e32 v0, v0, v1
; VI: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_min_f16_e32 v0, v2, v0
; VI: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI: v_min_f16_e32 v0, v0, v3
; VI: v_or_b32_e32 v0, v0, v1

; GFX9: v_pk_min_f16
; GFX9: v_pk_min_f16
; GFX9: v_pk_min_f16
define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
; VI: s_waitcnt
; VI-NEXT: v_min_f16_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_min_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_min_f16_e32 v0, v2, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_min_f16_e32 v0, v0, v3
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_min_f16 v0, v0, v1
; GFX9-NEXT: v_pk_min_f16 v0, v2, v0
; GFX9-NEXT: v_pk_min_f16 v0, v0, v3
; GFX9-NEXT: s_setpc_b64
define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #2 {
entry:
%min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
%min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
%res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
%min = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
%min1 = call <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
%res = call <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
ret <2 x half> %res
}

Expand All @@ -121,3 +125,4 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind "no-nans-fp-math"="true" }
49 changes: 40 additions & 9 deletions llvm/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-SAFE,GCN,SI %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN,SI %s

; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN,VI %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN,VI %s

; GCN-LABEL: {{^}}min_fneg_select_regression_0:
; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
; GCN-NOT: v_mul

; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0

; VI-SAFE: v_cmp_nle_f32_e32 vcc, 1.0, v0
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0

; GCN-NONAN: v_max_f32_e64 v0, -v0, -1.0
define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ult float %a, 1.0
Expand All @@ -12,7 +22,14 @@ define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 {
}

; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
; GCN-NOT: v_mul

; SI-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0

; VI-SAFE: v_cmp_nle_f32_e32 vcc, -1.0, v0
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0

; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 {
%fneg.a = fsub float -0.0, %a
Expand All @@ -22,19 +39,33 @@ define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0
}

; GCN-LABEL: {{^}}max_fneg_select_regression_0:
; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
; GCN-NOT: v_mul

; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0

; VI-SAFE: v_cmp_nge_f32_e32 vcc, 1.0, v0
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0

; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 {
define amdgpu_ps float @max_fneg_select_regression_0(float %a) #0 {
; Negate the input (written as -0.0 - %a).
%fneg.a = fsub float -0.0, %a
; Unordered-or-greater-than compare of the un-negated input against 1.0.
%cmp.a = fcmp ugt float %a, 1.0
; Pick the negated input when the compare holds, otherwise the constant -1.0.
%min.a = select i1 %cmp.a, float %fneg.a, float -1.0
ret float %min.a
}

; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
; GCN-NOT: v_mul

; SI-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0

; VI-SAFE: v_cmp_nge_f32_e32 vcc, -1.0, v0
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, -1.0, v0, vcc
; VI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0

; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 {
define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a) #0 {
%fneg.a = fsub float -0.0, %a
%cmp.a = fcmp ugt float %a, -1.0
%min.a = select i1 %cmp.a, float %fneg.a, float 1.0
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; VI-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NNAN-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
Expand Down Expand Up @@ -179,7 +179,7 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; VI-NNAN-NEXT: v_min_f16_sdwa v4, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v4
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
Expand Down Expand Up @@ -284,8 +284,8 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v3
; VI-NNAN-NEXT: v_min_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v2
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v5
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
Expand Down Expand Up @@ -438,10 +438,10 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; VI-NNAN-NEXT: v_min_f16_e32 v1, v1, v5
; VI-NNAN-NEXT: v_min_f16_sdwa v11, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NNAN-NEXT: v_min_f16_e32 v0, v0, v4
; VI-NNAN-NEXT: v_or_b32_e32 v0, v0, v11
; VI-NNAN-NEXT: v_or_b32_e32 v1, v1, v10
; VI-NNAN-NEXT: v_or_b32_e32 v2, v2, v9
; VI-NNAN-NEXT: v_or_b32_e32 v3, v3, v8
; VI-NNAN-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
; SI-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
Expand Down
126 changes: 93 additions & 33 deletions llvm/test/CodeGen/AMDGPU/fmin_legacy.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN-NONAN -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN-SAFE,SI-SAFE,GCN,FUNC %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NONAN,GCN-NONAN,GCN,FUNC %s

; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-SAFE,GCN-SAFE,GCN,FUNC %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI-NONAN,GCN-NONAN,GCN,FUNC %s

; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s

declare i32 @llvm.r600.read.tidig.x() #1
Expand All @@ -10,8 +14,13 @@ declare i32 @llvm.r600.read.tidig.x() #1

; FUNC-LABEL: {{^}}s_test_fmin_legacy_subreg_inputs_f32:
; EG: MIN *
; GCN-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GCN-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}

; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}

; VI-SAFE: v_cmp_nlt_f32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}

; VI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(1)* %out, <4 x float> %reg0) #0 {
%r0 = extractelement <4 x float> %reg0, i32 0
%r1 = extractelement <4 x float> %reg0, i32 1
Expand All @@ -22,27 +31,37 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(float addrspace(
}

; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; GCN-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}

; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]

; GCN-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]

; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]]
; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]]

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
; Kernel that builds a min-like pattern from a compare and a select on its
; two float kernel arguments and stores the selected value to %out.
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
; Unordered-or-less-or-equal compare of %a against %b.
%cmp = fcmp ule float %a, %b
; %a when the compare holds, otherwise %b.
%val = select i1 %cmp, float %a, float %b
store float %val, float addrspace(1)* %out, align 4
ret void
}

; Nsz also needed
; FIXME: Should separate tests
; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src:
; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; GCN: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}

; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0
; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0

; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]]

; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[ADD_A]], [[ADD_B]]
; VI-SAFE: v_cndmask_b32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]], vcc

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[ADD_A]], [[ADD_B]]
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1)* %out, float %a, float %b) #0 {
%a.nnan = fadd nnan float %a, 1.0
Expand All @@ -54,9 +73,14 @@ define amdgpu_kernel void @s_test_fmin_legacy_ule_f32_nnan_src(float addrspace(1
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ule_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]

; VI-SAFE: v_cmp_ngt_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
Expand All @@ -73,9 +97,14 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; VI-SAFE: v_cmp_le_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
Expand All @@ -92,9 +121,14 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmin_legacy_olt_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]

; VI-SAFE: v_cmp_lt_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
Expand All @@ -111,9 +145,14 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ult_f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]

; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
Expand All @@ -130,9 +169,14 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, fl
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v1f32:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]

; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]

; VI-SAFE: v_cmp_nge_f32_e32 vcc, [[A]], [[B]]
; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]

; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
Expand All @@ -149,10 +193,15 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)*
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v2f32:
; GCN: buffer_load_dwordx2
; GCN: buffer_load_dwordx2
; GCN-SAFE: v_min_legacy_f32_e32
; GCN-SAFE: v_min_legacy_f32_e32
; GCN: {{buffer|flat}}_load_dwordx2
; GCN: {{buffer|flat}}_load_dwordx2
; SI-SAFE: v_min_legacy_f32_e32
; SI-SAFE: v_min_legacy_f32_e32

; VI-SAFE: v_cmp_nge_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE: v_cmp_nge_f32_e32
; VI-SAFE: v_cndmask_b32_e32

; GCN-NONAN: v_min_f32_e32
; GCN-NONAN: v_min_f32_e32
Expand All @@ -171,13 +220,24 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)*
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ult_v3f32:
; GCN-SAFE: v_min_legacy_f32_e32
; GCN-SAFE: v_min_legacy_f32_e32
; GCN-SAFE: v_min_legacy_f32_e32
; SI-SAFE: v_min_legacy_f32_e32
; SI-SAFE: v_min_legacy_f32_e32
; SI-SAFE: v_min_legacy_f32_e32
; SI-SAFE-NOT: v_min_

; VI-SAFE: v_cmp_nge_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE: v_cmp_nge_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE: v_cmp_nge_f32_e32
; VI-SAFE: v_cndmask_b32_e32
; VI-SAFE-NOT: v_cmp
; VI-SAFE-NOT: v_cndmask

; GCN-NONAN: v_min_f32_e32
; GCN-NONAN: v_min_f32_e32
; GCN-NONAN: v_min_f32_e32
; GCN-NONAN-NOT: v_min_
define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.r600.read.tidig.x() #1
%gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
Expand All @@ -193,8 +253,8 @@ define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)*
}

; FUNC-LABEL: {{^}}test_fmin_legacy_ole_f32_multi_use:
; GCN: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: v_min
; GCN: v_cmp_le_f32
; GCN-NEXT: v_cndmask_b32
Expand Down
36 changes: 28 additions & 8 deletions llvm/test/CodeGen/AMDGPU/fminnum.f64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,35 @@ declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0

; FUNC-LABEL: @test_fmin_f64
; SI: v_min_f64
define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
; FUNC-LABEL: {{^}}test_fmin_f64_ieee:
; SI: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
; SI: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
; SI-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
; SI-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
; Kernel that takes the minnum of two double kernel arguments and stores the
; result through an undef global pointer. The checks preceding this function
; expect each operand to be loaded separately, passed through a v_max_f64
; with itself, and only then fed to the v_min_f64.
; NOTE(review): the unnamed [8 x i32] arguments presumably pad the kernel
; argument layout so the two doubles are loaded separately - confirm.
define amdgpu_kernel void @test_fmin_f64_ieee([8 x i32], double %a, [8 x i32], double %b) nounwind {
%val = call double @llvm.minnum.f64(double %a, double %b) #0
store double %val, double addrspace(1)* undef, align 8
ret void
}

; FUNC-LABEL: {{^}}test_fmin_f64_no_ieee:
; SI: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]]
; SI: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]]
; SI-NOT: [[VAL0]]
; SI-NOT: [[VAL1]]
; SI: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]]
; SI-NOT: [[RESULT]]
; SI: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
%a = load volatile double, double addrspace(3)* undef
%b = load volatile double, double addrspace(3)* undef
%val = call double @llvm.minnum.f64(double %a, double %b) #0
store double %val, double addrspace(1)* %out, align 8
store volatile double %val, double addrspace(3)* undef
ret void
}

; FUNC-LABEL: @test_fmin_v2f64
; FUNC-LABEL: {{^}}test_fmin_v2f64:
; SI: v_min_f64
; SI: v_min_f64
define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
Expand All @@ -24,7 +44,7 @@ define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x
ret void
}

; FUNC-LABEL: @test_fmin_v4f64
; FUNC-LABEL: {{^}}test_fmin_v4f64:
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
Expand All @@ -35,7 +55,7 @@ define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x
ret void
}

; FUNC-LABEL: @test_fmin_v8f64
; FUNC-LABEL: {{^}}test_fmin_v8f64:
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
Expand All @@ -50,7 +70,7 @@ define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x
ret void
}

; FUNC-LABEL: @test_fmin_v16f64
; FUNC-LABEL: {{^}}test_fmin_v16f64:
; SI: v_min_f64
; SI: v_min_f64
; SI: v_min_f64
Expand Down
77 changes: 52 additions & 25 deletions llvm/test/CodeGen/AMDGPU/fminnum.ll
Original file line number Diff line number Diff line change
@@ -1,14 +1,45 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; minnum.f32 on two scalar kernel arguments in an amdgpu_kernel function (the
; "ieee_mode_on" case, per the test name). The expected code multiplies each
; input by 1.0 (captured as QUIET0/QUIET1) before the v_min_f32, and the
; result must reach the buffer store without being touched in between.
; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on:
; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
; GCN: v_mul_f32_e64 [[QUIET1:v[0-9]+]], 1.0, s{{[0-9]+}}
; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET1]], [[QUIET0]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @test_fmin_f32_ieee_mode_on(float addrspace(1)* %out, float %a, float %b) #0 {
%val = call float @llvm.minnum.f32(float %a, float %b) #1
store float %val, float addrspace(1)* %out, align 4
ret void
}

; minnum.f32 carrying the nnan flag in a function with the default calling
; convention (the "ieee_mode_on" case, per the test name). The checks require
; the v_min_f32 on v0/v1 to come immediately after the s_waitcnt and to be
; followed directly by the return, i.e. no extra instruction on the inputs.
; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_on:
; GCN: s_waitcnt
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define float @test_fmin_nnan_f32_ieee_mode_on(float %a, float %b) #0 {
%val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
ret float %val
}

; minnum.f32 carrying the nnan flag in an amdgpu_ps function (the
; "ieee_mode_off" case, per the test name). The checks require that nothing
; mentions v0 or v1 before the v_min_f32, and that the return marker follows
; the v_min_f32 immediately.
; GCN-LABEL: {{^}}test_fmin_nnan_f32_ieee_mode_off:
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: ; return
define amdgpu_ps float @test_fmin_nnan_f32_ieee_mode_off(float %a, float %b) #0 {
%val = call nnan float @llvm.minnum.f32(float %a, float %b) #1
ret float %val
}

; minnum.f32 without fast-math flags in an amdgpu_ps function (the
; "ieee_mode_off" case, per the test name). The checks expect a v_min_f32 on
; the argument registers v0/v1 followed directly by the return marker.
; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_off:
; GCN: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: ; return
define amdgpu_ps float @test_fmin_f32_ieee_mode_off(float %a, float %b) #0 {
%val = call float @llvm.minnum.f32(float %a, float %b) #1
ret float %val
}

; GCN-LABEL: {{^}}test_fmin_v2f32:
; GCN: v_min_f32_e32
; GCN: v_min_f32_e32
Expand Down Expand Up @@ -147,38 +178,34 @@ define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out
ret void
}

; minnum.f32 of a VGPR argument and the inline immediate 2.0 in an amdgpu_ps
; function (the "no_ieee" case, per the test name). The check expects the
; immediate to appear as the first source operand of the e32 encoding.
; GCN-LABEL: {{^}}fmin_var_immediate_f32_no_ieee:
; GCN: v_min_f32_e32 v0, 2.0, v0
define amdgpu_ps float @fmin_var_immediate_f32_no_ieee(float %a) #0 {
%val = call float @llvm.minnum.f32(float %a, float 2.0) #1
ret float %val
}

; minnum.f32 of the inline immediate 2.0 and an inreg argument in an amdgpu_ps
; function (the "no_ieee" case, per the test name). The check expects the e64
; encoding with an SGPR source and the immediate as the last operand.
; GCN-LABEL: {{^}}fmin_immediate_var_f32_no_ieee:
; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0
define amdgpu_ps float @fmin_immediate_var_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.minnum.f32(float 2.0, float %a) #1
ret float %val
}

; minnum.f32 of an inreg argument and the constant 99.0 in an amdgpu_ps
; function (the "no_ieee" case, per the test name). The checks expect 99.0
; (0x42c60000) to be moved into a VGPR first and then used as the second
; source of the v_min_f32, with the argument supplied from an SGPR.
; GCN-LABEL: {{^}}fmin_var_literal_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
define amdgpu_ps float @fmin_var_literal_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.minnum.f32(float %a, float 99.0) #1
ret float %val
}

; minnum.f32 of the constant 99.0 and an inreg argument (operands in the
; opposite order from the var/literal test) in an amdgpu_ps function (the
; "no_ieee" case, per the test name). The checks expect the same code shape:
; 99.0 (0x42c60000) moved into a VGPR and used as the second source of the
; v_min_f32, with the argument supplied from an SGPR.
; GCN-LABEL: {{^}}fmin_literal_var_f32_no_ieee:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000
; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]]
define amdgpu_ps float @fmin_literal_var_f32_no_ieee(float inreg %a) #0 {
%val = call float @llvm.minnum.f32(float 99.0, float %a) #1
ret float %val
}

; GCN-LABEL: {{^}}test_func_fmin_v3f32:
Expand Down
311 changes: 243 additions & 68 deletions llvm/test/CodeGen/AMDGPU/fneg-combines.ll

Large diffs are not rendered by default.

69 changes: 51 additions & 18 deletions llvm/test/CodeGen/AMDGPU/known-never-snan.ll
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%b.nnan.add = fadd nnan float %b, 1.0
Expand All @@ -110,14 +109,46 @@ define float @v_test_known_not_snan_minnum_input_fmed3_r_i_i_f32(float %a, float
ret float %med
}

; minnum whose first operand (%a) is used as-is and whose second operand is
; the result of a plain fadd, followed by maxnum(x, 2.0) and minnum(x, 4.0).
; The expected code multiplies v0 by 1.0 before the v_min_f32 (presumably
; quieting the possibly-signaling %a -- confirm against the lowering) and
; emits a single v_med3_f32 with 2.0 and 4.0 for the trailing max/min pair.
define float @v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src0_input_fmed3_r_i_i_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nsnan = fadd float %b, 1.0
%known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nsnan)
%max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
%med = call float @llvm.minnum.f32(float %max, float 4.0)
ret float %med
}

; minnum whose first operand is the result of a plain fadd and whose second
; operand (%b) is used as-is, followed by maxnum(x, 2.0) and minnum(x, 4.0).
; The expected code multiplies v1 by 1.0 before the v_min_f32 (presumably
; quieting the possibly-signaling %b -- confirm against the lowering) and
; emits a single v_med3_f32 with 2.0 and 4.0 for the trailing max/min pair.
define float @v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
; GCN-LABEL: v_test_known_not_minnum_maybe_nan_src1_input_fmed3_r_i_i_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nsnan = fadd float %a, 1.0
%known.not.snan = call float @llvm.minnum.f32(float %a.nsnan, float %b)
%max = call float @llvm.maxnum.f32(float %known.not.snan, float 2.0)
%med = call float @llvm.minnum.f32(float %max, float 4.0)
ret float %med
}

define float @v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b) #0 {
; GCN-LABEL: v_minnum_possible_nan_lhs_input_fmed3_r_i_i_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nnan.add = fadd nnan float %b, 1.0
%known.not.snan = call float @llvm.minnum.f32(float %a, float %b.nnan.add)
Expand All @@ -131,9 +162,9 @@ define float @v_minnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%known.not.snan = call float @llvm.minnum.f32(float %a.nnan.add, float %b)
Expand All @@ -148,8 +179,8 @@ define float @v_test_known_not_snan_maxnum_input_fmed3_r_i_i_f32(float %a, float
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%b.nnan.add = fadd nnan float %b, 1.0
Expand All @@ -164,8 +195,9 @@ define float @v_maxnum_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nnan.add = fadd nnan float %b, 1.0
%known.not.snan = call float @llvm.maxnum.f32(float %a, float %b.nnan.add)
Expand All @@ -179,8 +211,9 @@ define float @v_maxnum_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_max3_f32 v0, v0, v1, 2.0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%known.not.snan = call float @llvm.maxnum.f32(float %a.nnan.add, float %b)
Expand Down Expand Up @@ -215,8 +248,8 @@ define float @v_select_possible_nan_lhs_input_fmed3_r_i_i_f32(float %a, float %b
; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%b.nnan.add = fadd nnan float %b, 1.0
%cmp = icmp eq i32 %c, 0
Expand All @@ -233,8 +266,8 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%a.nnan.add = fdiv nnan float 1.0, %a
%cmp = icmp eq i32 %c, 0
Expand Down Expand Up @@ -494,6 +527,7 @@ define float @v_test_known_not_snan_fmed3_input_fmed3_r_i_i_f32(float %a, float
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_med3_f32 v0, v0, v1, v2
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%known.not.snan = call float @llvm.amdgcn.fmed3.f32(float %a, float %b, float %c)
Expand All @@ -507,8 +541,7 @@ define float @v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32(float %a, float
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_min3_f32 v0, v0, v1, v2
; GCN-NEXT: v_max_f32_e32 v0, 2.0, v0
; GCN-NEXT: v_min_f32_e32 v0, 4.0, v0
; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
; GCN-NEXT: s_setpc_b64 s[30:31]
%min0 = call float @llvm.minnum.f32(float %a, float %b)
%known.not.snan = call float @llvm.minnum.f32(float %min0, float %c)
Expand Down
782 changes: 650 additions & 132 deletions llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll

Large diffs are not rendered by default.

836 changes: 704 additions & 132 deletions llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll

Large diffs are not rendered by default.

112 changes: 91 additions & 21 deletions llvm/test/CodeGen/AMDGPU/reduction.ll
Original file line number Diff line number Diff line change
Expand Up @@ -434,12 +434,23 @@ entry:
}

; GCN-LABEL: {{^}}reduction_maxnum_v4f16:
; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; VI: v_max_f16_sdwa
; VI-NEXT: v_max_f16_e32
; VI-NEXT: v_max_f16_e32
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_maxnum_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
Expand All @@ -451,12 +462,24 @@ entry:
}

; GCN-LABEL: {{^}}reduction_minnum_v4f16:
; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]

; VI: v_min_f16_sdwa
; VI-NEXT: v_min_f16_e32
; VI-NEXT: v_min_f16_e32

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_minnum_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
Expand All @@ -467,13 +490,36 @@ entry:
ret half %res
}

; FIXME: Need to preserve fast math flags when fmaxnum matched
; directly from the IR to avoid unnecessary quieting.

; GCN-LABEL: {{^}}reduction_fast_max_pattern_v4f16:
; GFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; XGFX9: v_pk_max_f16 [[MAX:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; XGFX9-NEXT: v_max_f16_sdwa v{{[0-9]+}}, [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; XVI: s_waitcnt
; XVI-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; XVI-NEXT: v_max_f16_e32 v0, v0, v1
; XVI-NEXT: v_max_f16_e32 v0, v0, v2
; XVI-NEXT: s_setpc_b64

; VI: v_max_f16_sdwa
; VI-NEXT: v_max_f16_e32
; VI-NEXT: v_max_f16_e32
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_max_f16 [[MAX:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MAX]], [[MAX]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_max_f16_e32 v0, [[MAX]], [[TMP]]

; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI-DAG: v_max_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_max_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_max_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
Expand All @@ -486,13 +532,37 @@ entry:
ret half %res
}

; FIXME: Need to preserve fast math flags when fmaxnum matched
; directly from the IR to avoid unnecessary quieting.

; GCN-LABEL: {{^}}reduction_fast_min_pattern_v4f16:
; GFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; XGFX9: v_pk_min_f16 [[MIN:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; XGFX9-NEXT: v_min_f16_sdwa v{{[0-9]+}}, [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1

; XVI: s_waitcnt
; XVI-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; XVI-NEXT: v_min_f16_e32 v0, v0, v1
; XVI-NEXT: v_min_f16_e32 v0, v0, v2
; XVI-NEXT: s_setpc_b64

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 [[CANON1:v[0-9]+]], v1, v1
; GFX9-NEXT: v_pk_max_f16 [[CANON0:v[0-9]+]], v0, v0
; GFX9-NEXT: v_pk_min_f16 [[MIN:v[0-9]+]], [[CANON0]], [[CANON1]]{{$}}

; FIXME: Extra canonicalize leftover
; GFX9-NEXT: v_max_f16_sdwa [[TMP:v[0-9]+]], [[MIN]], [[MIN]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: v_min_f16_e32 v0, [[MIN]], [[TMP]]


; VI-DAG: v_max_f16_sdwa [[CANON1:v[0-9]+]], v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_sdwa [[CANON3:v[0-9]+]], v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 [[CANON2:v[0-9]+]], v1, v1

; VI: v_min_f16_sdwa
; VI-NEXT: v_min_f16_e32
; VI-NEXT: v_min_f16_e32
; VI-DAG: v_min_f16_e32 [[MAX0:v[0-9]+]], [[CANON1]], [[CANON3]]
; VI-DAG: v_min_f16_e32 [[MAX1:v[0-9]+]], [[CANON0]], [[CANON2]]
; VI: v_min_f16_e32 v0, [[MAX1]], [[MAX0]]
define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) {
entry:
%rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
Expand Down