Skip to content

Commit

Permalink
[AMDGPU] Min/max changes for GFX12 (#75214)
Browse files Browse the repository at this point in the history
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
  • Loading branch information
piotrAMD and rampitec committed Dec 13, 2023
1 parent c9e1003 commit 6eec801
Show file tree
Hide file tree
Showing 30 changed files with 4,922 additions and 17 deletions.
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) {
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_FSIN:
case AMDGPU::G_FPEXT:
case AMDGPU::G_INTRINSIC_TRUNC:
Expand Down Expand Up @@ -174,6 +176,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return AMDGPU::G_FMINNUM_IEEE;
case AMDGPU::G_FMINNUM_IEEE:
return AMDGPU::G_FMAXNUM_IEEE;
case AMDGPU::G_FMAXIMUM:
return AMDGPU::G_FMINIMUM;
case AMDGPU::G_FMINIMUM:
return AMDGPU::G_FMAXIMUM;
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
return AMDGPU::G_AMDGPU_FMIN_LEGACY;
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
Expand Down Expand Up @@ -207,6 +213,8 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
// 0 doesn't have a negated inline immediate.
Expand Down Expand Up @@ -304,6 +312,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
NegateOperand(MatchInfo->getOperand(1));
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,8 @@ static bool fnegFoldsIntoOpcode(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
Expand Down Expand Up @@ -4572,6 +4574,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return ISD::FMINNUM_IEEE;
case ISD::FMINNUM_IEEE:
return ISD::FMAXNUM_IEEE;
case ISD::FMAXIMUM:
return ISD::FMINIMUM;
case ISD::FMINIMUM:
return ISD::FMAXIMUM;
case AMDGPUISD::FMAX_LEGACY:
return AMDGPUISD::FMIN_LEGACY;
case AMDGPUISD::FMIN_LEGACY:
Expand Down Expand Up @@ -4695,6 +4701,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMIN_LEGACY: {
// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
Expand Down Expand Up @@ -5305,6 +5313,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(FMAXIMUM3)
NODE_NAME_CASE(FMINIMUM3)
NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
Expand Down Expand Up @@ -5759,6 +5769,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::FMED3:
case AMDGPUISD::FMIN3:
case AMDGPUISD::FMAX3:
case AMDGPUISD::FMINIMUM3:
case AMDGPUISD::FMAXIMUM3:
case AMDGPUISD::FMAD_FTZ: {
if (SNaN)
return true;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,8 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
FMAXIMUM3,
FMINIMUM3,
FDOT2,
URECIP,
DIV_SCALE,
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = maximum(a, b, c), where a, b and c are floats. The operation is
// IEEE 754-2019 compliant. As with the neighbouring three-operand min/max
// nodes, the commutative/associative properties are left commented out.
def AMDGPUfmaximum3 : SDNode<"AMDGPUISD::FMAXIMUM3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = max(a, b, c) a, b, and c are signed ints
def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
Expand All @@ -185,6 +190,11 @@ def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = minimum(a, b, c), where a, b and c are floats. The operation is
// IEEE 754-2019 compliant. As with the neighbouring three-operand min/max
// nodes, the commutative/associative properties are left commented out.
def AMDGPUfminimum3 : SDNode<"AMDGPUISD::FMINIMUM3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;

// out = min(a, b, c) a, b and c are signed ints
def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ def umin_oneuse : HasOneUseBinOp<umin>;

def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
def fminimum_oneuse : HasOneUseBinOp<fminimum>;
def fmaximum_oneuse : HasOneUseBinOp<fmaximum>;

def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
Expand Down
27 changes: 16 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1959,20 +1959,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0)
.scalarize(0);

getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
getActionDefinitionsBuilder(
{// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,

G_ATOMIC_CMPXCHG_WITH_SUCCESS,
G_ATOMICRMW_NAND,
G_ATOMICRMW_FSUB,
G_READ_REGISTER,
G_WRITE_REGISTER,
G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
G_READ_REGISTER, G_WRITE_REGISTER,

G_SADDO, G_SSUBO,
G_SADDO, G_SSUBO})
.lower();

// TODO: Implement
G_FMINIMUM, G_FMAXIMUM}).lower();
// Legalization rules for G_FMINIMUM / G_FMAXIMUM depend on whether the
// subtarget reports IEEE minimum/maximum support.
if (ST.hasIEEEMinMax()) {
// Legal for the types listed in FPTypesPK16. S16 vectors (type index 0) are
// clamped to at most 2 elements; whatever is still not legal is scalarized.
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
.legalFor(FPTypesPK16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0);
} else {
// TODO: Implement. Until then, both opcodes use the generic lower() action.
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
}

getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
.lower();
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3727,14 +3727,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_INTRINSIC_ROUNDEVEN:
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_STRICT_FADD:
case AMDGPU::G_STRICT_FSUB:
case AMDGPU::G_STRICT_FMUL:
case AMDGPU::G_STRICT_FMA: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
isSALUMapping(MI))
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
unsigned Size = Ty.getSizeInBits();
if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
(Size == 32 || Size == 16) && isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -1218,6 +1218,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true if the target has the IEEE mode bit in its kernel descriptor,
// i.e. its generation is older than GFX12.
bool hasIEEEMode() const {
  const auto Gen = getGeneration();
  return Gen < GFX12;
}

// \returns true if the target has the IEEE fminimum/fmaximum instructions,
// i.e. its generation is GFX12 or newer.
bool hasIEEEMinMax() const {
  const auto Gen = getGeneration();
  return Gen >= GFX12;
}

// \returns true if the target has the WG_RR_MODE bit in its kernel descriptor,
// i.e. its generation is GFX12 or newer.
bool hasRrWGMode() const {
  const auto Gen = getGeneration();
  return Gen >= GFX12;
}

Expand Down
26 changes: 23 additions & 3 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);

if (Subtarget->hasIEEEMinMax())
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
{MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
Expand Down Expand Up @@ -800,6 +804,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FMAXNUM,
ISD::FMINNUM_IEEE,
ISD::FMAXNUM_IEEE,
ISD::FMINIMUM,
ISD::FMAXIMUM,
ISD::FMA,
ISD::SMIN,
ISD::SMAX,
Expand Down Expand Up @@ -11786,10 +11792,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
case AMDGPUISD::FMIN3: {
case AMDGPUISD::FMIN3:
case AMDGPUISD::FMAXIMUM3:
case AMDGPUISD::FMINIMUM3: {
// FIXME: Shouldn't treat the generic operations differently based on these.
// However, we aren't really required to flush the result from
// minnum/maxnum..
Expand Down Expand Up @@ -11943,7 +11953,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE: {
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FMINIMUM:
case AMDGPU::G_FMAXIMUM: {
if (Subtarget->supportsMinMaxDenormModes() ||
// FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
Expand Down Expand Up @@ -12131,13 +12143,17 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
case ISD::FMAXIMUM:
return AMDGPUISD::FMAXIMUM3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
return AMDGPUISD::UMAX3;
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
case ISD::FMINIMUM:
return AMDGPUISD::FMINIMUM3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
case ISD::UMIN:
Expand Down Expand Up @@ -12497,7 +12513,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE: {
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Expand Down Expand Up @@ -13759,6 +13777,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
case ISD::FMAXIMUM:
case ISD::FMINIMUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
Expand Down
24 changes: 24 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5255,11 +5255,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
Expand Down Expand Up @@ -7101,6 +7105,26 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
// Scalar IEEE minimum/maximum (f32 and f16 forms). The replacement is built
// by hand: the new instruction is given explicit source-modifier, clamp and
// omod operands, all zero.
// NOTE(review): NewOpcode is set outside this hunk, presumably by
// getVALUOp() - confirm.
case AMDGPU::S_MINIMUM_F32:
case AMDGPU::S_MAXIMUM_F32:
case AMDGPU::S_MINIMUM_F16:
case AMDGPU::S_MAXIMUM_F16: {
const DebugLoc &DL = Inst.getDebugLoc();
// The result goes into a fresh VGPR_32 virtual register for all four opcodes.
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// Insert the new instruction before Inst, reusing its two source operands.
MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
.addImm(0) // src0_modifiers
.add(Inst.getOperand(1))
.addImm(0) // src1_modifiers
.add(Inst.getOperand(2))
.addImm(0) // clamp
.addImm(0); // omod
// Redirect every reference to the old destination register to NewDst.
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);

// Legalize the new instruction's operands, queue the users of NewDst on the
// move-to-VALU worklist, then drop the original scalar instruction. The early
// return skips the remainder of the function for these opcodes.
legalizeOperands(*NewInstr, MDT);
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
Inst.eraseFromParent();
return;
}
}

if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -3441,6 +3441,12 @@ defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
} // End Predicates = [isGFX9Plus]

// GFX12+ selection patterns for the fused IEEE minimum/maximum instructions
// (f32 and f16). Each pattern pairs a divergent fmaximum/fminimum with a
// single-use node of the opposite kind.
// NOTE(review): which node is the outer and which the inner operation follows
// from the FPMinMaxPat argument order, defined outside this hunk - confirm.
let OtherPredicates = [isGFX12Plus] in {
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}

// Convert a floating-point power of 2 to the integer exponent.
def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
Expand Down
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,15 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
// Uses = [MODE], SchedRW = [WriteSFPU]

// On GFX12, the MIN/MAX instructions do not read the MODE register, so this
// group has no `Uses = [MODE]`, unlike the HasSALUFloatInsts group that ends
// just above. Scalar IEEE minimum/maximum for f32 and f16.
let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 1, isCommutable = 1,
isReMaterializable = 1, SchedRW = [WriteSFPU] in {
def S_MINIMUM_F32 : SOP2_F32_Inst<"s_minimum_f32", fminimum>;
def S_MAXIMUM_F32 : SOP2_F32_Inst<"s_maximum_f32", fmaximum>;
def S_MINIMUM_F16 : SOP2_F16_Inst<"s_minimum_f16", fminimum>;
def S_MAXIMUM_F16 : SOP2_F16_Inst<"s_maximum_f16", fmaximum>;
}

//===----------------------------------------------------------------------===//
// SOPK Instructions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -2017,6 +2026,10 @@ defm S_MIN_NUM_F32 : SOP2_Real_Renamed_gfx12<0x042, S_MIN_F32, "s_min_num_f32">;
defm S_MAX_NUM_F32 : SOP2_Real_Renamed_gfx12<0x043, S_MAX_F32, "s_max_num_f32">;
defm S_MIN_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04b, S_MIN_F16, "s_min_num_f16">;
defm S_MAX_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04c, S_MAX_F16, "s_max_num_f16">;
defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>;
defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>;
defm S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>;
defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;

defm S_ADD_CO_U32 : SOP2_Real_Renamed_gfx12<0x000, S_ADD_U32, "s_add_co_u32">;
defm S_SUB_CO_U32 : SOP2_Real_Renamed_gfx12<0x001, S_SUB_U32, "s_sub_co_u32">;
Expand Down

0 comments on commit 6eec801

Please sign in to comment.