-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Update uses of new VOP2 pseudos for GFX12 #78155
Merged
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
New pseudos were added for instructions that were natively VOP3 on GFX11: V_ADD_F64_pseudo, V_MUL_F64_pseudo, V_MIN_NUM_F64, V_MAX_NUM_F64, V_LSHLREV_B64_pseudo
jayfoad
changed the title
gfx12 new pseudos
[AMDGPU] Update uses of new VOP2 pseudos for GFX12
Jan 15, 2024
@llvm/pr-subscribers-backend-amdgpu Author: Jay Foad (jayfoad) Changes
Patch is 98.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78155.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index aa7639a0f18665..2862a7787e75a3 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1498,6 +1498,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F16_t16_e64:
case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -1567,7 +1568,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
static int getOModValue(unsigned Opc, int64_t Val) {
switch (Opc) {
- case AMDGPU::V_MUL_F64_e64: {
+ case AMDGPU::V_MUL_F64_e64:
+ case AMDGPU::V_MUL_F64_pseudo_e64: {
switch (Val) {
case 0x3fe0000000000000: // 0.5
return SIOutMods::DIV2;
@@ -1618,6 +1620,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();
switch (Op) {
case AMDGPU::V_MUL_F64_e64:
+ case AMDGPU::V_MUL_F64_pseudo_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_t16_e64:
case AMDGPU::V_MUL_F16_fake16_e64:
@@ -1625,8 +1628,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
- ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
- Op == AMDGPU::V_MUL_F16_t16_e64 ||
+ ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
+ Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
Op == AMDGPU::V_MUL_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
@@ -1655,6 +1658,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
return std::pair(RegOp, OMod);
}
case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64:
case AMDGPU::V_ADD_F16_t16_e64:
@@ -1662,8 +1666,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
- ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
- Op == AMDGPU::V_ADD_F16_t16_e64 ||
+ ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
+ Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
Op == AMDGPU::V_ADD_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b4bd46d33c1f10..ffa7952888c355 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1862,7 +1862,10 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
>;
def : ClampPat<V_MAX_F32_e64, f32>;
+let SubtargetPredicate = isNotGFX12Plus in
def : ClampPat<V_MAX_F64_e64, f64>;
+let SubtargetPredicate = isGFX12Plus in
+def : ClampPat<V_MAX_NUM_F64_e64, f64>;
let SubtargetPredicate = NotHasTrue16BitInsts in
def : ClampPat<V_MAX_F16_e64, f16>;
let SubtargetPredicate = UseRealTrue16Insts in
@@ -2990,10 +2993,12 @@ def : GCNPat<
}
// TODO: Handle fneg like other types.
+let SubtargetPredicate = isNotGFX12Plus in {
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
+}
} // End AddedComplexity = -5
multiclass SelectCanonicalizeAsMax<
@@ -3009,7 +3014,13 @@ multiclass SelectCanonicalizeAsMax<
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
- let OtherPredicates = f64_preds;
+ let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
}
def : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index b95231fd8880f5..92660b9a646ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -857,7 +857,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
+; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -938,7 +938,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
+; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1020,7 +1020,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_waitcnt lgkmcnt(0)
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
; GFX12-NEXT: s_waitcnt vmcnt(0)
-; GFX12-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
+; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 41b436473f6521..fc4bc7595da5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11PLUS,GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11PLUS,GFX12 %s
declare float @llvm.fabs.f32(float) #0
declare float @llvm.canonicalize.f32(float) #0
@@ -55,6 +56,19 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, ptr addrspace(1) %out
@@ -106,6 +120,17 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: s_test_canonicalize_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float %val)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -147,6 +172,19 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1|
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fabs = call float @llvm.fabs.f32(float %val)
%canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
@@ -190,6 +228,19 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1|
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fabs = call float @llvm.fabs.f32(float %val)
%val.fabs.fneg = fneg float %val.fabs
@@ -234,6 +285,19 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT: s_waitcnt vmcnt(0)
+; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %out
%val.fneg = fneg float %val
%canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
@@ -260,15 +324,15 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_undef_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_undef_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float undef)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -293,15 +357,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_p0_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_p0_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 0.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -327,16 +391,16 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_n0_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_n0_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: v_bfrev_b32_e32 v1, 1
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float -0.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -362,15 +426,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_p1_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_p1_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 1.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -396,15 +460,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_n1_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_n1_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float -1.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -430,15 +494,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_fold_canonicalize_literal_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_literal_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float 16.0)
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -463,15 +527,15 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
+; GFX11PLUS-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT: s_nop 0
+; GFX11PLUS-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -509,6 +573,17 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
+; GFX12-NEXT: s_waitcnt lgkmcnt(0)
+; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
store float %canonicalized, ptr addrspace(1) %out
ret void
@@ -546,6 +621,17 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL:...
[truncated]
|
jayfoad
requested review from
piotrAMD,
arsenm,
kosarev,
mbrkusanin and
mariusz-sikora-at-amd
January 18, 2024 10:56
mbrkusanin
approved these changes
Jan 18, 2024
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
New pseudos were added for instructions that were natively VOP3 on
GFX11: V_ADD_F64_pseudo, V_MUL_F64_pseudo, V_MIN_NUM_F64, V_MAX_NUM_F64,
V_LSHLREV_B64_pseudo