Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Update uses of new VOP2 pseudos for GFX12 #78155

Merged
merged 3 commits into from
Jan 18, 2024
Merged

[AMDGPU] Update uses of new VOP2 pseudos for GFX12 #78155

merged 3 commits into from
Jan 18, 2024

Conversation

jayfoad
Copy link
Contributor

@jayfoad jayfoad commented Jan 15, 2024

New pseudos were added for instructions that were natively VOP3 on
GFX11: V_ADD_F64_pseudo, V_MUL_F64_pseudo, V_MIN_NUM_F64, V_MAX_NUM_F64,
V_LSHLREV_B64_pseudo

jayfoad and others added 2 commits January 15, 2024 11:22
New pseudos were added for instructions that were natively VOP3 on
GFX11: V_ADD_F64_pseudo, V_MUL_F64_pseudo, V_MIN_NUM_F64, V_MAX_NUM_F64,
V_LSHLREV_B64_pseudo
@jayfoad jayfoad changed the title gfx12 new pseudos [AMDGPU] Update uses of new VOP2 pseudos for GFX12 Jan 15, 2024
@llvmbot
Copy link
Collaborator

llvmbot commented Jan 15, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes
  • Add some GFX12 test coverage
  • [AMDGPU] Update uses of new VOP2 pseudos for GFX12

Patch is 98.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/78155.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+9-5)
  • (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+12-1)
  • (modified) llvm/test/CodeGen/AMDGPU/clamp.ll (+3-3)
  • (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+640-308)
  • (modified) llvm/test/CodeGen/AMDGPU/omod.ll (+303-187)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index aa7639a0f18665..2862a7787e75a3 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1498,6 +1498,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
   case AMDGPU::V_MAX_F16_t16_e64:
   case AMDGPU::V_MAX_F16_fake16_e64:
   case AMDGPU::V_MAX_F64_e64:
+  case AMDGPU::V_MAX_NUM_F64_e64:
   case AMDGPU::V_PK_MAX_F16: {
     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
       return nullptr;
@@ -1567,7 +1568,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
 
 static int getOModValue(unsigned Opc, int64_t Val) {
   switch (Opc) {
-  case AMDGPU::V_MUL_F64_e64: {
+  case AMDGPU::V_MUL_F64_e64:
+  case AMDGPU::V_MUL_F64_pseudo_e64: {
     switch (Val) {
     case 0x3fe0000000000000: // 0.5
       return SIOutMods::DIV2;
@@ -1618,6 +1620,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
   unsigned Op = MI.getOpcode();
   switch (Op) {
   case AMDGPU::V_MUL_F64_e64:
+  case AMDGPU::V_MUL_F64_pseudo_e64:
   case AMDGPU::V_MUL_F32_e64:
   case AMDGPU::V_MUL_F16_t16_e64:
   case AMDGPU::V_MUL_F16_fake16_e64:
@@ -1625,8 +1628,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
     // If output denormals are enabled, omod is ignored.
     if ((Op == AMDGPU::V_MUL_F32_e64 &&
          MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
-        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
-          Op == AMDGPU::V_MUL_F16_t16_e64 ||
+        ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
+          Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
           Op == AMDGPU::V_MUL_F16_fake16_e64) &&
          MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
       return std::pair(nullptr, SIOutMods::NONE);
@@ -1655,6 +1658,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
     return std::pair(RegOp, OMod);
   }
   case AMDGPU::V_ADD_F64_e64:
+  case AMDGPU::V_ADD_F64_pseudo_e64:
   case AMDGPU::V_ADD_F32_e64:
   case AMDGPU::V_ADD_F16_e64:
   case AMDGPU::V_ADD_F16_t16_e64:
@@ -1662,8 +1666,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
     // If output denormals are enabled, omod is ignored.
     if ((Op == AMDGPU::V_ADD_F32_e64 &&
          MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
-        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
-          Op == AMDGPU::V_ADD_F16_t16_e64 ||
+        ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
+          Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
           Op == AMDGPU::V_ADD_F16_fake16_e64) &&
          MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
       return std::pair(nullptr, SIOutMods::NONE);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b4bd46d33c1f10..ffa7952888c355 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1862,7 +1862,10 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
 >;
 
 def : ClampPat<V_MAX_F32_e64, f32>;
+let SubtargetPredicate = isNotGFX12Plus in
 def : ClampPat<V_MAX_F64_e64, f64>;
+let SubtargetPredicate = isGFX12Plus in
+def : ClampPat<V_MAX_NUM_F64_e64, f64>;
 let SubtargetPredicate = NotHasTrue16BitInsts in
 def : ClampPat<V_MAX_F16_e64, f16>;
 let SubtargetPredicate = UseRealTrue16Insts in
@@ -2990,10 +2993,12 @@ def : GCNPat<
 }
 
 // TODO: Handle fneg like other types.
+let SubtargetPredicate = isNotGFX12Plus in {
 def : GCNPat<
   (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
   (V_MUL_F64_e64  0, CONST.FP64_ONE, $src_mods, $src)
 >;
+}
 } // End AddedComplexity = -5
 
 multiclass SelectCanonicalizeAsMax<
@@ -3009,7 +3014,13 @@ multiclass SelectCanonicalizeAsMax<
   def : GCNPat<
     (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
     (V_MAX_F64_e64  $src_mods, $src, $src_mods, $src)> {
-    let OtherPredicates = f64_preds;
+    let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
+  }
+
+  def : GCNPat<
+    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+    (V_MAX_NUM_F64_e64  $src_mods, $src, $src_mods, $src)> {
+    let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
   }
 
   def : GCNPat<
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index b95231fd8880f5..92660b9a646ff3 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -857,7 +857,7 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1] clamp
+; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -938,7 +938,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
+; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1020,7 +1020,7 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
+; GFX12-NEXT:    v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
 ; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
 ; GFX12-NEXT:    s_nop 0
 ; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 41b436473f6521..fc4bc7595da5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11PLUS,GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11PLUS,GFX12 %s
 
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.canonicalize.f32(float) #0
@@ -55,6 +56,19 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_var_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_max_num_f32_e32 v1, v1, v1
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, ptr addrspace(1) %out
@@ -106,6 +120,17 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: s_test_canonicalize_var_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_max_num_f32_e64 v1, s2, s2
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -147,6 +172,19 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_max_num_f32_e64 v1, |v1|, |v1|
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
   %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
@@ -190,6 +228,19 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_max_num_f32_e64 v1, -|v1|, -|v1|
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %val.fabs = call float @llvm.fabs.f32(float %val)
   %val.fabs.fneg = fneg float %val.fabs
@@ -234,6 +285,19 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_max_num_f32_e64 v1, -v1, -v1
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %out
   %val.fneg = fneg float %val
   %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
@@ -260,15 +324,15 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fold_canonicalize_undef_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_undef_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float undef)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -293,15 +357,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fold_canonicalize_p0_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_p0_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -327,16 +391,16 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fold_canonicalize_n0_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_n0_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -362,15 +426,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fold_canonicalize_p1_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_p1_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -396,15 +460,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fold_canonicalize_n1_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_n1_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, -1.0
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -430,15 +494,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_fold_canonicalize_literal_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_fold_canonicalize_literal_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41800000
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -463,15 +527,15 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX11PLUS-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX11PLUS:       ; %bb.0:
+; GFX11PLUS-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11PLUS-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11PLUS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11PLUS-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11PLUS-NEXT:    s_nop 0
+; GFX11PLUS-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11PLUS-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -509,6 +573,17 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    v_max_num_f32_e64 v1, 0x7fffff, 0x7fffff
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, ptr addrspace(1) %out
   ret void
@@ -546,6 +621,17 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL:...
[truncated]

@jayfoad jayfoad merged commit 0a3a0ea into llvm:main Jan 18, 2024
3 of 4 checks passed
@jayfoad jayfoad deleted the gfx12-new-pseudos branch January 18, 2024 13:26
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

4 participants