-
Notifications
You must be signed in to change notification settings - Fork 15.2k
AMDGPU/GlobalISel: Start legalizing minimumnum and maximumnum #140900
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU/GlobalISel: Start legalizing minimumnum and maximumnum #140900
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesThis is the bare minimum to get the intrinsic to compile for AMDGPU, Just re-use the existing lowering for the old semantics for Patch is 1.06 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/140900.diff 4 Files Affected:
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 72f2ba75c927e..7b18a98d7f3ca 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3221,6 +3221,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_FMAXNUM_IEEE:
case TargetOpcode::G_FMINIMUM:
case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_FMINIMUMNUM:
+ case TargetOpcode::G_FMAXIMUMNUM:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FREM:
case TargetOpcode::G_FCEIL:
@@ -4591,6 +4593,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
return lowerFCopySign(MI);
case G_FMINNUM:
case G_FMAXNUM:
+ case G_FMINIMUMNUM:
+ case G_FMAXIMUMNUM:
return lowerFMinNumMaxNum(MI);
case G_MERGE_VALUES:
return lowerMergeValues(MI);
@@ -5379,6 +5383,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_FMAXNUM_IEEE:
case G_FMINIMUM:
case G_FMAXIMUM:
+ case G_FMINIMUMNUM:
+ case G_FMAXIMUMNUM:
case G_FSHL:
case G_FSHR:
case G_ROTL:
@@ -6090,6 +6096,8 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
case TargetOpcode::G_FMAXNUM_IEEE:
case TargetOpcode::G_FMINIMUM:
case TargetOpcode::G_FMAXIMUM:
+ case TargetOpcode::G_FMINIMUMNUM:
+ case TargetOpcode::G_FMAXIMUMNUM:
case TargetOpcode::G_STRICT_FADD:
case TargetOpcode::G_STRICT_FSUB:
case TargetOpcode::G_STRICT_FMUL:
@@ -8139,8 +8147,27 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
- unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
- TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
+ // FIXME: fminnum/fmaxnum and fminimumnum/fmaximumnum should not have
+ // identical handling. fminimumnum/fmaximumnum also need a path that do not
+ // depend on fminnum/fmaxnum.
+
+ unsigned NewOp;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_FMINNUM:
+ NewOp = TargetOpcode::G_FMINNUM_IEEE;
+ break;
+ case TargetOpcode::G_FMINIMUMNUM:
+ NewOp = TargetOpcode::G_FMINNUM;
+ break;
+ case TargetOpcode::G_FMAXNUM:
+ NewOp = TargetOpcode::G_FMAXNUM_IEEE;
+ break;
+ case TargetOpcode::G_FMAXIMUMNUM:
+ NewOp = TargetOpcode::G_FMAXNUM;
+ break;
+ default:
+ llvm_unreachable("unexpected min/max opcode");
+ }
auto [Dst, Src0, Src1] = MI.getFirst3Regs();
LLT Ty = MRI.getType(Dst);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 7bb461e0a239f..86e8e7e241b35 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -960,6 +960,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &MinNumMaxNum = getActionDefinitionsBuilder({
G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
+ // TODO: These should be custom lowered and are directly legal with IEEE=0
+ auto &MinimumNumMaximumNum =
+ getActionDefinitionsBuilder({G_FMINIMUMNUM, G_FMAXIMUMNUM});
+
if (ST.hasVOP3PInsts()) {
MinNumMaxNum.customFor(FPTypesPK16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
@@ -976,6 +980,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
+ MinimumNumMaximumNum.lower();
+
if (ST.hasVOP3PInsts())
FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
@@ -2102,6 +2108,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
}
+
+
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
.lower();
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index e299f959edb08..c45d86ce306e7 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -1,106 +1,209 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-SDAG,GFX900-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX900,GFX9-GISEL,GFX900-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-SDAG,GFX950-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX9,GFX950,GFX9-GISEL,GFX950-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-SDAG,GFX11-TRUE16-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16,GFX11-GISEL,GFX11-TRUE16-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-SDAG,GFX11-FAKE16-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16,GFX11-GISEL,GFX11-FAKE16-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16,GFX12-SDAG,GFX12-TRUE16-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16,GFX12-GISEL,GFX12-TRUE16-GISEL %s
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-SDAG,GFX12-FAKE16-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s
define half @v_maximumnum_f16(half %x, half %y) {
-; GFX7-LABEL: v_maximumnum_f16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_maximumnum_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX8-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_maximumnum_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_maximumnum_f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX10-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-TRUE16-LABEL: v_maximumnum_f16:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
-; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: v_maximumnum_f16:
-; GFX11-FAKE16: ; %bb.0:
-; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-TRUE16-LABEL: v_maximumnum_f16:
-; GFX12-TRUE16: ; %bb.0:
-; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
-; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h
-; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-FAKE16-LABEL: v_maximumnum_f16:
-; GFX12-FAKE16: ; %bb.0:
-; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
-; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v1, v1, v1
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v0
-; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v0, v0, v1
-; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: v_maximumnum_f16:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: v_maximumnum_f16:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_maximumnum_f16:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-GISEL-LABEL: v_maximumnum_f16:
+; GFX8-GISEL: ; %bb.0:
+; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX8-GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX8-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-SDAG-LABEL: v_maximumnum_f16:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-SDAG-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_maximumnum_f16:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX9-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_maximumnum_f16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-SDAG-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-GISEL-LABEL: v_maximumnum_f16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX10-GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX10-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-SDAG-LABEL: v_maximumnum_f16:
+; GFX11-TRUE16-SDAG: ; %bb.0:
+; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-SDAG-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-GISEL-LABEL: v_maximumnum_f16:
+; GFX11-TRUE16-GISEL: ; %bb.0:
+; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-GISEL-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-SDAG-LABEL: v_maximumnum_f16:
+; GFX11-FAKE16-SDAG: ; %bb.0:
+; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-SDAG-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-GISEL-LABEL: v_maximumnum_f16:
+; GFX11-FAKE16-GISEL: ; %bb.0:
+; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v1, v1, v1
+; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-GISEL-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-SDAG-LABEL: v_maximumnum_f16:
+; GFX12-TRUE16-SDAG: ; %bb.0:
+; GFX12-TRUE16-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-SDAG-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-SDAG-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-SDAG-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-GISEL-LABEL: v_maximumnum_f16:
+; GFX12-TRUE16-GISEL: ; %bb.0:
+; GFX12-TRUE16-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-GISEL-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.l
+; GFX12-TRUE16-GISEL-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l
+; GFX12-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-GISEL-NEXT: v_max_num_f16_e32 v0.l, v0.l, v0.h
+; GFX12-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-SDAG-LABEL: v_maximumnum_f16:
+; GFX12-FAKE16-SDAG: ; %bb.0:
+; GFX12-FAKE16-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-SDAG-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-SDAG-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-SDAG-NEXT: v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-GISEL-LABEL: v_maximumnum_f16:
+; GFX12-FAKE16-GISEL: ; %bb.0:
+; GFX12-FAKE16-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-GISEL-NEXT: v_max_num_f16_e32 v0, v0, v0
+; GFX12-FAKE16-GISEL-NEXT: v_max_num_f16_e32 v1, v1, v1
+; GFX12-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-GISEL-NEXT: v_max_num_f16_e32 v0, v0, v1
+; GFX12-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.maximumnum.f16(half %x, half %y)
ret half %result
}
define half @v_maximumnum_f16_nnan(half %x, half %y) {
-; GFX7-LABEL: v_maximumnum_f16_nnan:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: v_maximumnum_f16_nnan:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: v_maximumnum_f16_nnan:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximumnum_f16_nnan:
; GFX8: ; %bb.0:
@@ -156,13 +259,22 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
}
define half @v_maximumnum_f16_1.0(half %x) {
-; GFX7-LABEL: v_maximumnum_f16_1.0:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_max_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
+; GFX7-SDAG-LABEL: v_maximumnum_f16_1.0:
+; GFX7-SDAG: ; %bb.0:
+; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_max_f32_e32 v0, 1.0, v0
+; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-GISEL-LABEL: v_maximumnum_f16_1.0:
+; GFX7-GISEL: ; %bb.0:
+; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GF...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
This is the bare minimum to get the intrinsic to compile for AMDGPU, and it's not optimal. We need to follow along closer with the existing G_FMINNUM/G_FMAXNUM with custom lowering to handle the IEEE=0 case better. Just re-use the existing lowering for the old semantics for G_FMINNUM/G_FMAXNUM. This does not change G_FMINNUM/G_FMAXNUM's treatment, nor try to handle the general expansion without an underlying min/max variant (or with G_FMINIMUM/G_FMAXIMUM).
00f5af3
to
2fa1530
Compare
fyi: I still have the SDAG implementation for |
This is the bare minimum to get the intrinsic to compile for AMDGPU,
and it's not optimal. We need to follow along closer with the existing
G_FMINNUM/G_FMAXNUM with custom lowering to handle the IEEE=0 case better.
Just re-use the existing lowering for the old semantics for
G_FMINNUM/G_FMAXNUM. This does not change G_FMINNUM/G_FMAXNUM's treatment,
nor try to handle the general expansion without an underlying min/max
variant (or with G_FMINIMUM/G_FMAXIMUM).