-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Min/max changes for GFX12 #75214
Merged
Merged
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Piotr Sobczak (piotrAMD) ChangesPatch is 257.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75214.diff 30 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index d48916670112c..69dc78d33c838 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -29,6 +29,8 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) {
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_FSIN:
case AMDGPU::G_FPEXT:
case AMDGPU::G_INTRINSIC_TRUNC:
@@ -174,6 +176,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return AMDGPU::G_FMINNUM_IEEE;
case AMDGPU::G_FMINNUM_IEEE:
return AMDGPU::G_FMAXNUM_IEEE;
+ case AMDGPU::G_FMAXIMUM:
+ return AMDGPU::G_FMINIMUM;
+ case AMDGPU::G_FMINIMUM:
+ return AMDGPU::G_FMAXIMUM;
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
return AMDGPU::G_AMDGPU_FMIN_LEGACY;
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
@@ -207,6 +213,8 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
// 0 doesn't have a negated inline immediate.
@@ -304,6 +312,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
NegateOperand(MatchInfo->getOperand(1));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index fcbdf51b03c1f..9d7443012e3da 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -585,6 +585,8 @@ static bool fnegFoldsIntoOpcode(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
@@ -4572,6 +4574,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return ISD::FMINNUM_IEEE;
case ISD::FMINNUM_IEEE:
return ISD::FMAXNUM_IEEE;
+ case ISD::FMAXIMUM:
+ return ISD::FMINIMUM;
+ case ISD::FMINIMUM:
+ return ISD::FMAXIMUM;
case AMDGPUISD::FMAX_LEGACY:
return AMDGPUISD::FMIN_LEGACY;
case AMDGPUISD::FMIN_LEGACY:
@@ -4695,6 +4701,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMIN_LEGACY: {
// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
@@ -5305,6 +5313,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FMAXIMUM3)
+ NODE_NAME_CASE(FMINIMUM3)
NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
@@ -5759,6 +5769,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::FMED3:
case AMDGPUISD::FMIN3:
case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMINIMUM3:
+ case AMDGPUISD::FMAXIMUM3:
case AMDGPUISD::FMAD_FTZ: {
if (SNaN)
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 6841067e31b3b..827fb106b5519 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -449,6 +449,8 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ FMAXIMUM3,
+ FMINIMUM3,
FDOT2,
URECIP,
DIV_SCALE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index fd38739876c4d..82f58ea38fd0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -170,6 +170,11 @@ def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;
+// out = max(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant.
+def AMDGPUfmaximum3 : SDNode<"AMDGPUISD::FMAXIMUM3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
// out = max(a, b, c) a, b, and c are signed ints
def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
@@ -185,6 +190,11 @@ def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;
+// out = min(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant.
+def AMDGPUfminimum3 : SDNode<"AMDGPUISD::FMINIMUM3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
// out = min(a, b, c) a, b and c are signed ints
def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index d2d09a0b1fc54..121026aca6035 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -252,6 +252,8 @@ def umin_oneuse : HasOneUseBinOp<umin>;
def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def fminimum_oneuse : HasOneUseBinOp<fminimum>;
+def fmaximum_oneuse : HasOneUseBinOp<fmaximum>;
def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ccdbd3216e260..f35d1bfa32cb3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1959,10 +1959,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_READ_REGISTER,
G_WRITE_REGISTER,
- G_SADDO, G_SSUBO,
+ G_SADDO, G_SSUBO}).lower();
- // TODO: Implement
- G_FMINIMUM, G_FMAXIMUM}).lower();
+ if (ST.hasIEEEMinMax()) {
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .legalFor(FPTypesPK16)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0);
+ } else {
+ // TODO: Implement
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .lower();
+ }
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
.lower();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79..df8e0c9400678 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3727,14 +3727,17 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_INTRINSIC_ROUNDEVEN:
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_STRICT_FADD:
case AMDGPU::G_STRICT_FSUB:
case AMDGPU::G_STRICT_FMUL:
case AMDGPU::G_STRICT_FMA: {
- unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
- isSALUMapping(MI))
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ unsigned Size = Ty.getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
+ (Size == 32 || Size == 16) && isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6..08e37f09b5c79 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1210,6 +1210,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
// \returns true is CSUB atomics support a no-return form.
bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
+ // \returns true if the target has IEEE fminimum/fmaximum instructions
+ bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3626715acb3c1..176eee2ac120b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -763,6 +763,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
+ if (Subtarget->hasIEEEMinMax())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
+ {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -800,6 +804,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FMAXNUM,
ISD::FMINNUM_IEEE,
ISD::FMAXNUM_IEEE,
+ ISD::FMINIMUM,
+ ISD::FMAXIMUM,
ISD::FMA,
ISD::SMIN,
ISD::SMAX,
@@ -11775,10 +11781,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMIN3: {
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAXIMUM3:
+ case AMDGPUISD::FMINIMUM3: {
// FIXME: Shouldn't treat the generic operations different based these.
// However, we aren't really required to flush the result from
// minnum/maxnum..
@@ -11932,7 +11942,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
- case AMDGPU::G_FMAXNUM_IEEE: {
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM: {
if (Subtarget->supportsMinMaxDenormModes() ||
// FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
@@ -12120,6 +12132,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
+ case ISD::FMAXIMUM:
+ return AMDGPUISD::FMAXIMUM3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
@@ -12127,6 +12141,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
+ case ISD::FMINIMUM:
+ return AMDGPUISD::FMINIMUM3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
case ISD::UMIN:
@@ -12486,7 +12502,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINNUM_IEEE: {
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
@@ -13748,6 +13766,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d4e4526795f3b..f01beb30a11e4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5255,11 +5255,15 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
+ case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
+ case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+ case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
+ case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
@@ -7101,6 +7105,26 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
+ case AMDGPU::S_MINIMUM_F32:
+ case AMDGPU::S_MAXIMUM_F32:
+ case AMDGPU::S_MINIMUM_F16:
+ case AMDGPU::S_MAXIMUM_F16: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(1))
+ .addImm(0) // src1_modifiers
+ .add(Inst.getOperand(2))
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+
+ legalizeOperands(*NewInstr, MDT);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ return;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9362fe5d9678b..f9bc623abcd04 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3441,6 +3441,12 @@ defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
} // End Predicates = [isGFX9Plus]
+let OtherPredicates = [isGFX12Plus] in {
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+}
// Convert a floating-point power of 2 to the integer exponent.
def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 9ff64968ef01b..b9a488f9cf7b3 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -847,6 +847,15 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
// Uses = [MODE], SchedRW = [WriteSFPU]
+// On GFX12 MIN/MAX instructions do not read MODE register.
+let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 1, isCommutable = 1,
+ isReMaterializable = 1, SchedRW = [WriteSFPU] in {
+ def S_MINIMUM_F32 : SOP2_F32_Inst<"s_minimum_f32", fminimum>;
+ def S_MAXIMUM_F32 : SOP2_F32_Inst<"s_maximum_f32", fmaximum>;
+ def S_MINIMUM_F16 : SOP2_F16_Inst<"s_minimum_f16", fminimum>;
+ def S_MAXIMUM_F16 : SOP2_F16_Inst<"s_maximum_f16", fmaximum>;
+}
+
//===----------------------------------------------------------------------===//
// SOPK Instructions
//===----------------------------------------------------------------------===//
@@ -2017,6 +2026,10 @@ defm S_MIN_NUM_F32 : SOP2_Real_Renamed_gfx12<0x042, S_MIN_F32, "s_min_num_f32">;
defm S_MAX_NUM_F32 : SOP2_Real_Renamed_gfx12<0x043, S_MAX_F32, "s_max_num_f32">;
defm S_MIN_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04b, S_MIN_F16, "s_min_num_f16">;
defm S_MAX_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04c, S_MAX_F16, "s_max_num_f16">;
+defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>;
+defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>;
+defm S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>;
+defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;
defm S_ADD_CO_U32 : SOP2_Real_Renamed_gfx12<0x000, S_ADD_U32, "s_add_co_u32">;
defm S_SUB_CO_U32 : SOP2_Real_Renamed_gfx12<0x001, S_SUB_U32, "s_sub_co_u32">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 617773b34ae98..685c9ac6a2be4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -161,6 +161,19 @@ defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteIntMul]
+
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
+defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
+defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
+defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
+defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>;
+} // End SchedRW = [WriteDoubleAdd]
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
@@ -211,6 +224,11 @@ let mayRaiseFPException = 0 in {
defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
} // End mayRaiseFPException = 0
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+ defm V_MINIMUM3_F32 : VOP3Inst <"v_minimum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfminimum3>;
+ defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
let isCommutable = 1 in {
defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -555,6 +573,11 @@ defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+ defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
+ defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
@@ -805,6 +828,13 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+ defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<...
[truncated]
|
|
rampitec
approved these changes
Dec 12, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
No description provided.