Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU][SIInsertWaitcnts] Set initial state for VS_CNT in non-kernel functions #75436

Merged
merged 1 commit into from
Dec 15, 2023

Conversation

Pierre-vh
Copy link
Contributor

Split from #72830

@llvmbot
Copy link
Collaborator

llvmbot commented Dec 14, 2023

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Split from #72830


Patch is 27.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75436.diff

4 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+10)
  • (modified) llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll (+1)
  • (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (-28)
  • (modified) llvm/test/CodeGen/AMDGPU/release-vgprs.mir (+99-226)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c16583f6a7f9ac..dfe67f4c189540 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -292,6 +292,11 @@ class WaitcntBrackets {
     VgprVmemTypes[GprNo] = 0;
   }
 
+  void setNonKernelFunctionInitialState() {
+    setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
+    PendingEvents |= WaitEventMaskForInst[VS_CNT];
+  }
+
   void print(raw_ostream &);
   void dump() { print(dbgs()); }
 
@@ -1865,6 +1870,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
       ;
     BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
 
+    auto NonKernelInitialState =
+        std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+    NonKernelInitialState->setNonKernelFunctionInitialState();
+    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+
     Modified = true;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
index 1ccd31e97a2aba..36ddd286155a34 100644
--- a/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/back-off-barrier-subtarget-feature.ll
@@ -55,6 +55,7 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
 ; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-BACKOFF-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX11-BACKOFF-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-BACKOFF-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-BACKOFF-NEXT:    s_barrier
 ; GFX11-BACKOFF-NEXT:    flat_store_b32 v[2:3], v0
 ; GFX11-BACKOFF-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
index 6fd6d6e2e31a1c..65b70587fa0ace 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll
@@ -17,8 +17,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b32 v[8:9], v0, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg:
@@ -39,8 +37,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v0, v10
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v0, off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg:
@@ -68,8 +64,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX11-NEXT:    s_not_b32 exec_lo, exec_lo
 ; GFX11-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg_64:
@@ -94,8 +88,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_64(ptr addrspace(1) %out, i6
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX11_W64-NEXT:    s_not_b64 exec, exec
 ; GFX11_W64-NEXT:    global_store_b64 v[8:9], v[0:1], off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg_64:
@@ -133,8 +125,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11-NEXT:    global_store_b32 v[8:9], v2, off
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: set_inactive_chain_arg_dpp:
@@ -174,8 +164,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_dpp(ptr addrspace(1) %out, i
 ; GFX11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX11_W64-NEXT:    global_store_b32 v[8:9], v2, off
-; GFX11_W64-NEXT:    s_nop 0
-; GFX11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11_W64-NEXT:    s_endpgm
 ;
 ; GFX10_W64-LABEL: set_inactive_chain_arg_dpp:
@@ -233,8 +221,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT:    s_nop 0
-; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11-NEXT:    s_endpgm
 ;
 ; DAGISEL11-LABEL: set_inactive_chain_arg_call:
@@ -265,8 +251,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT:    s_nop 0
-; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_call:
@@ -380,8 +364,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT:    s_nop 0
-; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
 ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_call:
@@ -419,8 +401,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_call(ptr addrspace(1) %out,
 ; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT:    s_nop 0
-; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_call:
@@ -538,8 +518,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; GISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11-NEXT:    s_nop 0
-; GISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11-NEXT:    s_endpgm
 ;
 ; DAGISEL11-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -570,8 +548,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; DAGISEL11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11-NEXT:    s_nop 0
-; DAGISEL11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11-NEXT:    s_endpgm
 ;
 ; GISEL10-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -685,8 +661,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; GISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; GISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; GISEL11_W64-NEXT:    s_nop 0
-; GISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GISEL11_W64-NEXT:    s_endpgm
 ;
 ; DAGISEL11_W64-LABEL: set_inactive_chain_arg_last_vgpr:
@@ -724,8 +698,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg_last_vgpr(ptr addrspace(1) %
 ; DAGISEL11_W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL11_W64-NEXT:    v_mov_b32_e32 v0, v12
 ; DAGISEL11_W64-NEXT:    global_store_b32 v[41:42], v0, off
-; DAGISEL11_W64-NEXT:    s_nop 0
-; DAGISEL11_W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; DAGISEL11_W64-NEXT:    s_endpgm
 ;
 ; GISEL10_W64-LABEL: set_inactive_chain_arg_last_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
index 3a879e818af797..e57fc0311bd3c6 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -march=amdgcn -mcpu=gfx1100 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,OPT
-# RUN: llc -O0 -march=amdgcn -mcpu=gfx1100 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,NOOPT
+# RUN: llc -O2 -march=amdgcn -mcpu=gfx1100 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -O0 -march=amdgcn -mcpu=gfx1100 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
 
 --- |
   define amdgpu_ps void @tbuffer_store1() { ret void }
@@ -28,17 +28,10 @@
 name:            tbuffer_store1
 body:             |
   bb.0:
-    ; OPT-LABEL: name: tbuffer_store1
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: tbuffer_store1
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: tbuffer_store1
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
     TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -47,17 +40,10 @@ body:             |
 name:            tbuffer_store2
 body:             |
   bb.0:
-    ; OPT-LABEL: name: tbuffer_store2
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: tbuffer_store2
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: tbuffer_store2
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+    ; CHECK-NEXT: S_ENDPGM 0
     TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
     S_ENDPGM 0
 ...
@@ -78,19 +64,11 @@ body:             |
 name:            global_store
 body:             |
   bb.0:
-    ; OPT-LABEL: name: global_store
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    ; OPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: global_store
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
-    ; NOOPT-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: global_store
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+    ; CHECK-NEXT: S_WAITCNT_VSCNT undef $sgpr_null, 0
+    ; CHECK-NEXT: S_ENDPGM 0
     GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
     S_WAITCNT_VSCNT undef $sgpr_null, 0
     S_ENDPGM 0
@@ -100,17 +78,10 @@ body:             |
 name:            buffer_store_format
 body:             |
   bb.0:
-    ; OPT-LABEL: name: buffer_store_format
-    ; OPT: S_WAITCNT 0
-    ; OPT-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: buffer_store_format
-    ; NOOPT: S_WAITCNT 0
-    ; NOOPT-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: buffer_store_format
+    ; CHECK: S_WAITCNT 0
+    ; CHECK-NEXT: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
     BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -137,23 +108,13 @@ body:             |
   bb.0:
     liveins: $vgpr0, $sgpr0_sgpr1
 
-    ; OPT-LABEL: name: global_store_dword
-    ; OPT: liveins: $vgpr0, $sgpr0_sgpr1
-    ; OPT-NEXT: {{  $}}
-    ; OPT-NEXT: S_WAITCNT 0
-    ; OPT-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
-    ; OPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; OPT-NEXT: S_NOP 0
-    ; OPT-NEXT: S_SENDMSG 3, implicit $exec, implicit $m0
-    ; OPT-NEXT: S_ENDPGM 0
-    ;
-    ; NOOPT-LABEL: name: global_store_dword
-    ; NOOPT: liveins: $vgpr0, $sgpr0_sgpr1
-    ; NOOPT-NEXT: {{  $}}
-    ; NOOPT-NEXT: S_WAITCNT 0
-    ; NOOPT-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
-    ; NOOPT-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
-    ; NOOPT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: global_store_dword
+    ; CHECK: liveins: $vgpr0, $sgpr0_sgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: S_WAITCNT 0
+    ; CHECK-NEXT: renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+    ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0
     renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
     GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
     S_ENDPGM 0
@@ -206,45 +167,24 @@ body:             |
 ---
 name:            multiple_basic_blocks2
 body:             |
-  ; OPT-LABEL: name: multiple_basic_blocks2
-  ; OPT: bb.0:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_WAITCNT 0
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.1:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.2:
-  ; OPT-NEXT:   S_NOP 0
-  ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
-  ;
-  ; NOOPT-LABEL: name: multiple_basic_blocks2
-  ; NOOPT: bb.0:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_WAITCNT 0
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.1:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT: bb.2:
-  ; NOOPT-NEXT:   S_ENDPGM 0
+  ; CHECK-LABEL: name: multiple_basic_blocks2
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_WAITCNT 0
+  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+  ; CHECK-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.2
 
@@ -268,65 +208,34 @@ body:             |
 ---
 name:            multiple_basic_blocks3
 body:             |
-  ; OPT-LABEL: name: multiple_basic_blocks3
-  ; OPT: bb.0:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_WAITCNT 0
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.1:
-  ; OPT-NEXT:   successors: %bb.2(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.2
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.2:
-  ; OPT-NEXT:   successors: %bb.4(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   S_BRANCH %bb.4
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.3:
-  ; OPT-NEXT:   successors: %bb.4(0x80000000)
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; OPT-NEXT:   S_BRANCH %bb.4
-  ; OPT-NEXT: {{  $}}
-  ; OPT-NEXT: bb.4:
-  ; OPT-NEXT:   S_NOP 0
-  ; OPT-NEXT:   S_SENDMSG 3, implicit $exec, implicit $m0
-  ; OPT-NEXT:   S_ENDPGM 0
-  ;
-  ; NOOPT-LABEL: name: multiple_basic_blocks3
-  ; NOOPT: bb.0:
-  ; NOOPT-NEXT:   successors: %bb.2(0x80000000)
-  ; NOOPT-NEXT: {{  $}}
-  ; NOOPT-NEXT:   S_WAITCNT 0
-  ; NOOPT-NEXT:   $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
-  ; NOOPT-NEXT:   TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
-  ; NOOPT-NEXT:   S_BRANCH %bb.2
-  ; NOOPT-NEXT: {{  $}}
-  ; NO...
[truncated]

@@ -55,6 +55,7 @@ define void @back_off_barrier_no_fence(ptr %in, ptr %out) #0 {
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-BACKOFF-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-BACKOFF-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-BACKOFF-NEXT: s_waitcnt_vscnt null, 0x0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess this is a bug fix. @kerbowa would you agree? The code in generateWaitcntInstBefore certainly looks like it's trying to wait for all counters to be zero. So the bug was that previously, this pass wrongly assumed that vs_cnt would be zero on entry to a non-kernel function,

Copy link
Contributor

@jayfoad jayfoad left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM, thanks.

@@ -17,8 +17,6 @@ define amdgpu_cs_chain void @set_inactive_chain_arg(ptr addrspace(1) %out, i32 %
; GFX11-NEXT: v_mov_b32_e32 v0, v10
; GFX11-NEXT: s_not_b32 exec_lo, exec_lo
; GFX11-NEXT: global_store_b32 v[8:9], v0, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fixing the bug explained in #72245.

@Pierre-vh Pierre-vh merged commit f1ea77f into llvm:main Dec 15, 2023
5 checks passed
@Pierre-vh Pierre-vh deleted the swaitcnt-initial-state branch December 15, 2023 07:31
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants