-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU] Enable multi-group xnack replay in hardware (GFX1250) #169016
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This patch enables multi-group XNACK replay mode by setting the REPLAY_MODE bit (bit 25) in the hardware MODE register at kernel entry. This aligns the hardware behavior with the compiler's existing multi-group s_wait_xcnt insertion logic, which already assumes multi-group mode.
|
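@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu
Author: Christudasan Devadasan (cdevadas)
Changes: This patch enables the multi-group xnack replay mode by configuring the hardware MODE register at kernel entry. This aligns the hardware behavior with the compiler's existing multi-group s_wait_xcnt insertion logic.
Patch is 2.50 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169016.diff
149 Files Affected: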
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index aa5ea77f17291..ffbb111d42221 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -772,6 +772,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
PreloadedScratchRsrcReg,
ScratchRsrcReg, ScratchWaveOffsetReg);
}
+
+ if (ST.hasWaitXCnt()) {
+ // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
+ // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
+ // insertion logic, which assumes multi-group mode by default.
+ unsigned RegEncoding =
+ AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(1)
+ .addImm(RegEncoding);
+ }
}
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index b5d593a9c15ed..58586129fb4e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -41,6 +41,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -71,6 +72,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -114,6 +116,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
;
; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -160,6 +163,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -190,6 +194,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -233,6 +238,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -279,6 +285,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -309,6 +316,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -352,6 +360,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
;
; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -397,6 +406,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -427,6 +437,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -470,6 +481,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -515,6 +527,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -545,6 +558,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -588,6 +602,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
;
; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -634,6 +649,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -664,6 +680,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -707,6 +724,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -753,6 +771,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -783,6 +802,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -826,6 +846,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
;
; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -871,6 +892,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -901,6 +923,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -944,6 +967,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -989,6 +1013,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1019,6 +1044,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
;
; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1062,6 +1088,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
;
; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1108,6 +1135,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1138,6 +1166,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1181,6 +1210,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
;
; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1227,6 +1257,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1257,6 +1288,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
;
; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1300,6 +1332,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
;
; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1345,6 +1378,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1375,6 +1409,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1]
@@ -1418,6 +1453,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
;
; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1486,6 +1522,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1558,6 +1595,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1632,6 +1670,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1704,6 +1743,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1894,6 +1934,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
;
; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-NEXT: s_mov_b32 s1, exec_lo
; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1949,6 +1990,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
@@ -1991,6 +2033,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
@@ -2035,6 +2078,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
@@ -2197,6 +2241,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
@@ -2255,6 +2300,7 @@ define amdgpu_kerne...
[truncated]
|
🐧 Linux x64 Test Results
|
jayfoad
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM

This patch enables the multi-group xnack replay mode by
configuring the hardware MODE register at kernel entry.
This aligns the hardware behavior with the compiler's
existing multi-group s_wait_xcnt insertion logic.